/* readris.c: functions to read RIS datasets */
/* markus@mhoenicka.de 3-11-00 */

/*
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.
   
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
   
   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/


#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <limits.h>

#include "refdb.h"
#include "connect.h"
#include "readris.h"
#include "strfncs.h"
#include "linklist.h"
#include "tokenize.h"

#define RIS_LINE_SIZE 16384
#define FILE_CHUNK_SIZE 4096

/* forward declarations of local functions */
static int wrap_citation(char* inbuffer, char** ptr_buffer, size_t *ptr_bufsize);

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  read_ris_set(): reads one dataset in RIS format from a stream and
                  appends it to a malloc()'ed string buffer

  int read_ris_set returns 0 if failed (read error, out of memory,
                   defaults file cannot be opened), 1 if a complete
                   set was read, 2 if the end of the file was reached

  FILE* fp a file pointer to the file containing the dataset

  char* deffile the name of a file containing default RIS fields.
              These fields are inserted in front of the "ER  - " line
              of the dataset unless deffile is NULL or an empty
              string. The name is tried as given first and relative
              to $HOME second

  char** ptr_inbuffer a pointer to a pointer to a buffer allocated with
                 malloc(). On entry the buffer must hold a
                 NUL-terminated string as the lines are appended with
                 strcat(). The calling function is responsible for
                 freeing the buffer after use. *ptr_inbuffer will be
                 modified by the function

  size_t* ptr_inbufsize a pointer to a variable holding the buffer
                 size. *ptr_inbufsize will be modified by the function

  int pull_in_default this should always be 0. the fn calls itself with
                this parameter set to 1 to read the defaults file; in
                that mode "TY  - " and "ER  - " lines are not copied

 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
int read_ris_set(FILE* fp, char* deffile, char** ptr_inbuffer, size_t* ptr_inbufsize, int pull_in_default) {
  int retval = 0; /* pessimistic default, used by all error exits */
  int setdone = 0;
  int filedone = 0;
  int n_pathlen;
  char* linebuffer = NULL;
  char* result;
  char* newinbuf;
  char* home;
  char homedefault[_POSIX_PATH_MAX+1];
  FILE* default_fp = NULL;

  if (deffile != NULL && *deffile) {
    default_fp = fopen(deffile, "rb");
    if (default_fp == NULL) {
      /* second attempt: look for the file in the home directory */
      home = getenv("HOME");
      if (home == NULL) {
        return 0; /* no HOME, nowhere else to look */
      }
      n_pathlen = snprintf(homedefault, sizeof(homedefault), "%s/%s", home, deffile);
      if (n_pathlen < 0 || (size_t)n_pathlen >= sizeof(homedefault)) {
        return 0; /* path does not fit, don't open a truncated name */
      }
      default_fp = fopen(homedefault, "rb");
      if (default_fp == NULL) {
        return 0;
      }
    }
  }

  linebuffer = malloc(RIS_LINE_SIZE);
  if (linebuffer == NULL) {
    goto cleanup;
  }

  /* pretend we start with a newline */
  *linebuffer = '\n';

  /* skip empty lines in front of the dataset. The former check that
     a dataset has to start with an empty line is disabled to simplify
     adding datasets from a GUI */
  while (*linebuffer == '\n' || (*linebuffer == '\r' && linebuffer[1] == '\n')) {
    result = fgets(linebuffer, RIS_LINE_SIZE, fp);
    if (result == NULL) {
      retval = feof(fp) ? 2 : 0;
      goto cleanup;
    }
  }

  /* the first non-empty line. The size is bumped even if the line is
     filtered out below */
  *ptr_inbufsize += strlen(linebuffer);
  newinbuf = realloc(*ptr_inbuffer, *ptr_inbufsize);
  if (newinbuf == NULL) {
    goto cleanup;
  }
  *ptr_inbuffer = newinbuf;

  if (!pull_in_default || (strncmp(linebuffer, "TY  - ", 6) != 0 && strncmp(linebuffer, "ER  - ", 6) != 0)) {
    strcat(*ptr_inbuffer, linebuffer);
  }

  while (!setdone) {
    result = fgets(linebuffer, RIS_LINE_SIZE, fp);
    if (result == NULL) {
      if (!feof(fp)) {
        goto cleanup; /* read error */
      }
      /* end of file. fgets() left linebuffer untouched, so it still
         holds the previous line which must not be appended again */
      filedone = 1;
      break;
    }

    if (strncmp(linebuffer, "ER  - ", 6) == 0) {
      /* pull in global fields if requested. This is best effort, a
         failure to read the defaults does not fail the dataset */
      if (!pull_in_default && default_fp != NULL) {
        read_ris_set(default_fp, "", ptr_inbuffer, ptr_inbufsize, 1);
        fclose(default_fp);
        default_fp = NULL; /* don't close it again during cleanup */
      }
      setdone = 1;
    }

    /* remove any CR. The line is cut at the CR, which drops the LF
       following it as well.
       NOTE(review): confirm that consumers do not need that LF */
    if ((result = strchr(linebuffer, (int)'\r')) != NULL) {
      *result = '\0';
    }

    if (!pull_in_default || (strncmp(linebuffer, "TY  - ", 6) != 0 && strncmp(linebuffer, "ER  - ", 6) != 0)) {
      *ptr_inbufsize += strlen(linebuffer);
      newinbuf = realloc(*ptr_inbuffer, *ptr_inbufsize);
      if (newinbuf == NULL) {
        goto cleanup;
      }
      *ptr_inbuffer = newinbuf;
      strcat(*ptr_inbuffer, linebuffer);
    }
  }

  /* add TERM_LEN zero bytes at the very end of the buffer */
  *ptr_inbufsize += TERM_LEN;
  newinbuf = realloc(*ptr_inbuffer, *ptr_inbufsize);
  if (newinbuf == NULL) {
    goto cleanup;
  }
  *ptr_inbuffer = newinbuf;
  memset((*ptr_inbuffer) + *ptr_inbufsize - TERM_LEN, (int)'\0', TERM_LEN);

  retval = filedone ? 2 : 1;

 cleanup:
  /* single exit so neither the line buffer nor the defaults stream
     can leak */
  free(linebuffer);
  if (default_fp != NULL) {
    fclose(default_fp);
  }
  return retval;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  add_id_from_ris(): collects the values of all "ID  - " lines of a
                     RIS stream and appends them, each followed by a
                     space, to a buffer allocated with malloc()

  int add_id_from_ris returns 0 if successful, 1 if an error occurred
                     or if no IDs were found

  FILE* infilefp ptr to an open stream with the incoming data

  char** ptr_buffer pointer to a buffer to which the result will be added.
                This pointer will be modified if it is necessary to
                reallocate the buffer.

  size_t *ptr_bufsize pointer to current length of ptr_buffer; may be
                modified

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
int add_id_from_ris(FILE* infilefp, char** ptr_buffer, size_t *ptr_bufsize) {
  char* line;
  char* id;
  char* grown;
  size_t id_len;
  size_t total_len = 0; /* stays 0 as long as no ID was appended */

  /* one extra byte as a space is appended to each ID */
  line = malloc((size_t)RIS_LINE_SIZE+1);
  if (line == NULL) {
    return 1;
  }

  while (fgets(line, RIS_LINE_SIZE, infilefp) != NULL) {
    if (strncmp(line, "ID  - ", 6) != 0) {
      continue; /* not an ID line */
    }

    /* the ID ends at the first CR or LF, if any; put a single space
       there instead */
    id = line + 6;
    id_len = strcspn(id, "\r\n");
    id[id_len] = ' ';
    id[id_len + 1] = '\0';

    grown = mstrcat(*ptr_buffer, id, ptr_bufsize, 0);
    if (grown == NULL) {
      free(line);
      return 1;
    }
    *ptr_buffer = grown;
    total_len += strlen(*ptr_buffer);
  }

  free(line);

  /* no IDs found is reported as an error */
  return (total_len != 0) ? 0 : 1;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  add_id_from_aux(): adds the IDs from an .aux file to a provided buffer
                     allocated with malloc(). The result will be a
                     whitespace-separated list

  int add_id_from_aux returns 0 if successful, 1 if no IDs were found,
                2 if a memory problem occurred

  FILE* infilefp ptr to an open stream with the incoming data

  char** ptr_buffer pointer to a buffer to which the result will be added.
                This pointer will be modified if it is necessary to
                reallocate the ptr_buffer.

  size_t *ptr_bufsize pointer to current length of ptr_buffer; may be modified

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
int add_id_from_aux(FILE* infilefp, char** ptr_buffer, size_t *ptr_bufsize) {
  char* linebuffer;
  size_t len;
  int n_have_id = 0;
  int n_result;

  linebuffer = malloc((size_t)RIS_LINE_SIZE+1);

  if (linebuffer == NULL) {
    return 2; /* memory problem, not "no IDs found" */
  }

  /* loop over all lines in the file */
  while (fgets(linebuffer, RIS_LINE_SIZE, infilefp) != NULL) {
    if (strncmp(linebuffer, "\\citation{", 10) != 0) {
      continue; /* only \citation{} lines carry IDs */
    }

    n_result = wrap_citation(linebuffer, ptr_buffer, ptr_bufsize);
    if (n_result == 0) {
      n_have_id = 1;
    }
    else if (n_result == 2) {
      free(linebuffer); /* don't leak the line buffer on the error path */
      return 2;
    }
    /* else: improper ID, just ignore this */
  }

  free(linebuffer);

  if (!n_have_id) { /* no IDs found */
    return 1;
  }

  /* wrap_citation() terminates each entry with a space; remove the
     one after the last entry */
  len = strlen(*ptr_buffer);
  if (len > 0 && (*ptr_buffer)[len-1] == ' ') {
    (*ptr_buffer)[len-1] = '\0';
  }
  return 0;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  wrap_citation(): extracts the ID(s) between the first '{' and the
                   following '}' of a citation and appends them,
                   followed by a space, to a whitespace-separated list

  static int wrap_citation returns 0 if ok, 1 if ID is malformed, 2 if a
                memory problem occurs

  char* inbuffer ptr to a string containing the citation. The string
                is modified in place

  char** ptr_buffer pointer to a buffer to which the result will be added.
                This pointer will be modified if it is necessary to
                reallocate the buffer.

  size_t *ptr_bufsize pointer to current length of ptr_buffer; may be modified

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
static int wrap_citation(char* inbuffer, char** ptr_buffer, size_t *ptr_bufsize) {
  char* brace;
  char* id;
  char* comma;
  char* grown;
  char* pieces[2];
  int i;

  /* the ID starts right after the opening brace; a brace at the very
     end of the string leaves no room for an ID */
  brace = strchr(inbuffer, (int)'{');
  if (brace == NULL || brace[1] == '\0') {
    fprintf(stderr, "citation without proper ID\n");
    return 1;
  }
  id = brace + 1;

  /* the ID ends at the closing brace */
  brace = strchr(id, (int)'}');
  if (brace == NULL) {
    fprintf(stderr, "citation without proper ID\n");
    return 1;
  }
  *brace = '\0';

  /* a multi-head citation holds a comma-separated list of IDs.
     Turning each comma into a space makes them regular members of the
     whitespace-separated list */
  for (comma = strchr(id, (int)','); comma != NULL; comma = strchr(comma + 1, (int)',')) {
    *comma = ' ';
  }

  /* append the ID(s) first, then the separating space */
  pieces[0] = id;
  pieces[1] = " ";
  for (i = 0; i < 2; i++) {
    grown = mstrcat(*ptr_buffer, pieces[i], ptr_bufsize, 0);
    if (grown == NULL) {
      fprintf(stderr, "out of memory\n");
      return 2;
    }
    *ptr_buffer = grown;
  }

  return 0;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  read_tokens(): reads tokens from an open file. A token is a string
                 between one or more separators on both sides. 
                 Separators are the file start, the end of file,
                 a space, a tab, a CR, and a LF. Use it to read in 
                 whitespace-delimited lists.

  char* read_tokens returns NULL if an error occurs, otherwise a
                 pointer to buffer. *buffer may get reallocated
                 while reading the file, so it is important that
                 after this function returns *ONLY* the return value
                 is used to address the result and *NEVER* *buffer
                 itself.

  int infile_fd file descriptor of an open file with read access
                 which contains the whitespace-delimited data to read 

  char *buffer buffer allocated with malloc() which will receive a
                space-separated string containing the read tokens.
                Must hold a NUL-terminated string on entry.
                buffer will be reallocated as needed, therefore use
                *ONLY* the return value to access the data after this
                function returns, *NEVER* the original *buffer

  size_t *ptr_buffer_len pointer to a variable holding the current size of
                buffer. Will be modified if a realloc() is necessary

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
char* read_tokens(int infile_fd, char* buffer, size_t *ptr_buffer_len) {

  char separators[] = " \n\r\t";
  char *queue;
  ssize_t nread;
  char* token;
  char* next_token;
  char *new_buffer;
  size_t token_len;
  size_t used;
  int open_token = 0; /* previous chunk ended in the middle of a token */
  int chunk_done;
  int error = 0;

  queue = malloc((size_t)FILE_CHUNK_SIZE);
  if (queue == NULL) {
    return NULL;
  }

  do { /* loop until file is read completely */
    /* read a chunk from the file, leaving room for the terminator */
    nread = read(infile_fd, (void*)queue, FILE_CHUNK_SIZE - 1);
    if (nread == -1) { /* can't read from this file */
      error = 1;
      break;
    }

    /* terminate to make a string */
    queue[nread] = '\0';

    /* each token in buffer is followed by a space. That space has to
       go if the token is continued by this chunk, i.e. the previous
       chunk ended inside a token and this one does not start with
       whitespace, or if the end of the file is reached. Never touch
       an empty buffer */
    if (nread == 0 || (open_token && strchr(separators, (int)queue[0]) == NULL)) {
      if (*ptr_buffer_len > 0) {
        used = strlen(buffer);
        if (used > 0 && buffer[used - 1] == ' ') {
          buffer[used - 1] = '\0';
        }
      }
    }

    /* remember how this chunk ends before the tokenizer loop below
       overwrites separators */
    open_token = (nread > 0 && strchr(separators, (int)queue[nread - 1]) == NULL);

    /* loop over all tokens. nstrtok() is used as in the rest of the
       code: it is assumed to return the start of the next token and
       to store its length, or to return NULL if there is none */
    next_token = queue;
    chunk_done = 0;
    while (!chunk_done) {
      token = nstrtok(next_token, &token_len, separators);
      if (token == NULL) {
        break;
      }

      if (token[token_len] == '\0') {
        /* the token runs up to the end of the chunk. There is no
           valid data past the terminator, so stop here */
        chunk_done = 1;
      }
      else {
        token[token_len] = '\0';
        next_token = token + token_len + 1;
      }

      /* append token to buffer */
      if ((new_buffer = mstrcat(buffer, token, ptr_buffer_len, 0)) == NULL) {
        error = 1;
        break;
      }
      buffer = new_buffer;

      /* append a space to the buffer */
      if ((new_buffer = mstrcat(buffer, " ", ptr_buffer_len, 0)) == NULL) {
        error = 1;
        break;
      }
      buffer = new_buffer;
    }

  } while (nread > 0 && !error);

  free(queue);

  if (error) {
    return NULL;
  }
  return buffer;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  risdate(): converts a RIS PY string into a year short int and an
             otherinfo string

  short int risdate returns the year or 0 if none was specified

  char* otherinfo_buffer will receive the other date information. Must
              hold up to 256 chars including the terminal \0. Longer
              input is truncated to fit

  char* string the PY string to convert. A four-character string
              without a slash is taken as the year. If the first slash
              follows four characters, those are the year and the
              rest, starting with that slash, is the other info.
              A string with a slash anywhere else is other info only

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
short int risdate(char* otherinfo_buffer, char* string) {
  enum { OTHERINFO_SIZE = 256 }; /* documented size of otherinfo_buffer */
  char* monthsep;
  short int year = 0;

  /* start with an empty string so strncat() below can be used as a
     bounded copy that always terminates the result */
  otherinfo_buffer[0] = '\0';

  monthsep = strchr(string, (int)'/');

  if (monthsep == NULL) {
    /* without a slash only a bare four-character year is legal */
    if (strlen(string) == 4) {
      year = atoi(string);
    }
  }
  else if (monthsep-string == 4) { /* if there is year info and more */
    year = atoi(string); /* atoi reads only until the separator */
    strncat(otherinfo_buffer, monthsep, OTHERINFO_SIZE - 1);
  }
  else { /* if there is no year info */
    strncat(otherinfo_buffer, string, OTHERINFO_SIZE - 1);
  }
  return year;
}
/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  ris_rpdate(): converts a RIS RP string into a status int and a
             date string

  int ris_rpdate returns -1 if an error occurred, otherwise returns
             the reprint status (0 = IN FILE, 1 = NOT IN FILE,
             2 = ON REQUEST)

  char* date_buffer will receive the date in YYYY-MM-DD format. Must
              hold 11 chars including the terminal \0. For reprint
              status 0 and 1 and if an error occurred, date_buffer
              will be an empty string. For status 2 the date is taken
              from a "(MM/DD/YY" part of rp_string; if that part is
              missing or malformed the current UTC date is used

  char* rp_string the string to convert

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
int ris_rpdate(char* date_buffer, char* rp_string) {
  /* offsets of the six digits in "(MM/DD/YY" */
  static const int digit_pos[] = {1, 2, 4, 5, 7, 8};
  int type = -1;
  int provide_default = 0;
  size_t i;
  char* date;
  char* up_string;
  time_t the_time;

  /* terminate in case we don't find a date */
  date_buffer[0] = '\0';

  /* need a copy for case-insensitive comparisons; strup() presumably
     converts the copy to uppercase in place */
  up_string = strdup(rp_string);
  if (up_string) {
    strup(up_string);
    if (strncmp(up_string, "IN FILE", 7) == 0) {
      type = 0;
    }
    else if (strncmp(up_string, "NOT IN FILE", 11) == 0) {
      type = 1;
    }
    else if (strncmp(up_string, "ON REQUEST", 10) == 0) {
      type = 2;
    }
  }
  /* else: out of memory, type stays -1 */

  free(up_string);

  if (type == 2) { /* on request */
    date = strchr(rp_string, (int)'(');

    /* "(MM/DD/YY" needs nine characters. Check the length first so
       none of the fixed offsets below reads past the end of the
       string */
    if (date == NULL || strlen(date) < 9 || date[3] != '/' || date[6] != '/') {
      provide_default = 1;
    }
    else {
      for (i = 0; i < sizeof(digit_pos)/sizeof(digit_pos[0]); i++) {
        if (date[digit_pos[i]] < '0' || date[digit_pos[i]] > '9') {
          provide_default = 1; /* not a numeric date */
          break;
        }
      }
    }

    if (provide_default) { /* lacking a better idea, use current UTC */
      time(&the_time);
      print_risdate(&the_time, date_buffer, 4);
    }
    else {
      /* this is the RIS version of the Y2K issue. Don't forget to modify this
       code in approx 100 years */
      if (atoi(date+7) < 70) {
        strcpy(date_buffer, "20");
      }
      else {
        strcpy(date_buffer, "19");
      }
      strncat(date_buffer, date+7, 2); /* year */
      strcat(date_buffer, "-");
      strncat(date_buffer, date+1, 2); /* month */
      strcat(date_buffer, "-");
      strncat(date_buffer, date+4, 2); /* day */
    }
  }
  return type;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  print_risdate(): prints a date given as a time_t value into a
                   string, using UTC. The format is MM/DD/YY if two
                   year digits are requested, YYYY-MM-DD otherwise

  char* print_risdate returns a ptr to the date string

  time_t* date ptr to the date/time to print

  char* datestring ptr to a buffer which will receive the output
                   The string must hold at least 12 chars

  int yeardigits number of digits to print for the year. Only the
                   value 2 selects the short format, any other value
                   selects YYYY-MM-DD

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
char* print_risdate(time_t* date, char* datestring, int yeardigits) {
  char year[16];
  struct tm *utc = gmtime(date);
  int full_year = utc->tm_year + 1900;
  int month = utc->tm_mon + 1;
  int day = utc->tm_mday;

  if (yeardigits != 2) {
    sprintf(datestring, "%04d-%02d-%02d", full_year, month, day);
    return datestring;
  }

  /* this is currently only used by the weird RIS output, which
     requires ON REQUEST (MM/DD/YY). Print the full year first and
     skip its first two characters */
  sprintf(year, "%04d", full_year);
  sprintf(datestring, "%02d/%02d/%s", month, day, &year[2]);
  return datestring;
}



