/*
   utfcheck - Check validity of a text file containing Unicode

   Author: Paul Hardy, unifoundry <at> unifoundry.com, June 2018
   
   Copyright (C) 2018 Paul Hardy

   LICENSE:

      This program is free software: you can redistribute it and/or modify
      it under the terms of the GNU General Public License as published by
      the Free Software Foundation, either version 2 of the License, or
      (at your option) any later version.

      This program is distributed in the hope that it will be useful,
      but WITHOUT ANY WARRANTY; without even the implied warranty of
      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      GNU General Public License for more details.

      You should have received a copy of the GNU General Public License
      along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

%{
#include <config.h>   /* Generated by GNU Autotools */

#include <locale.h>
#include <stdio.h>
#include <unistd.h>
#include <ctype.h>


/*
   YY_NO_INPUT and YY_NO_UNPUT are defined before the generated scanner
   code.  NOTE(review): presumably these ask the flex skeleton to omit
   its input() and yyunput() helpers, which this file never calls, so
   the compiler does not warn about unused functions -- confirm against
   the flex version in use.
*/
#define YY_NO_INPUT
#define YY_NO_UNPUT

/* NOTE(review): MAXSTRING is not referenced in the visible source. */
#define MAXSTRING 4096


/*
   Token classes passed to type_check() by the scanner rules.

   ASCII character type, the default.
*/
#define TYPE_ASCII	0x00  /* ASCII printable character */
#define TYPE_CONTROL_A	0x01  /* ASCII control code        */
/*
   UTF-8 sequences
*/
#define TYPE_BOM8	0x02  /* UTF-8 Byte Order Mark  */
#define TYPE_UTF8	0x03  /* UTF-8                  */
#define TYPE_CONTROL_8	0x04  /* Unicode control code   */
#define TYPE_NONCHAR	0x05  /* Unicode "noncharacter" */
#define TYPE_SURROGATE8 0x06  /* Surrogate code point encoded in UTF-8; not valid Unicode */
/*
   UTF-16 sequences
*/
#define TYPE_BOM16_BE	   0x10  /* UTF-16 big-endian BOM             */
#define TYPE_BOM16_LE	   0x11  /* UTF-16 little-endian BOM          */
#define TYPE_UTF16_BE	   0x12  /* UTF-16, big-endian format         */
#define TYPE_UTF16_LE	   0x13  /* UTF-16, little-endian format      */
#define TYPE_SURROGATE_BE  0x14  /* UTF-16 big-endian surrogate pair    */
#define TYPE_SURROGATE_LE  0x15  /* UTF-16 little-endian surrogate pair */
/*
   Not recognized as ASCII, UTF-8, or UTF-16
*/
#define TYPE_BINARY	0x40  /* Binary data--not valid in a text file */


/*
   Character set identifiers.  Each value is also the index of the
   matching name in char_set_string[] below, so the two lists must be
   kept in the same order.
*/
#define SET_ASCII	0
#define SET_UTF8	1
#define SET_UTF16_BE	2
#define SET_UTF16_LE	3
#define SET_BINARY	4
/*
   Character set detected so far in the input (one of the SET_* values);
   starts out as SET_ASCII and is updated by type_check().
*/
int char_set=0;
/*
   Printable names for the SET_* values, indexed by char_set when the
   file summary is printed.  The sixth entry is an empty string that
   no SET_* constant refers to.
*/
static char *char_set_string[6] = {
   "ASCII", "UTF-8", "UTF-16-BE", "UTF-16-LE", "BINARY", ""
};


/*
   Scanner state shared between the rules, type_check(), and the
   summary printed by main().
*/
unsigned nchars = 0;          /* Number of tokens counted by type_check()   */
int first_char  = 1;          /* Non-zero while at start of input file      */
int bom_begin   = 0;          /* = 1 if file starts with Byte Order Mark    */
int bom_inside  = 0;          /* Count of Byte Order Marks seen after start */
int null_seen   = 0;          /* File contains '\0'                         */
int cr_seen     = 0;          /* File contains '\r'                         */
int ctrl_seen   = 0;          /* File contains non-printable control codes  */
int esc_seen    = 0;          /* File contains escape sequences             */
int pua0_seen   = 0;          /* # of Plane  0 Private Use Area (PUA) chars */
int pua15_seen  = 0;          /* # of Plane 15 Private Use Area (PUA) chars */
int pua16_seen  = 0;          /* # of Plane 16 Private Use Area (PUA) chars */
int utf_type    = TYPE_ASCII; /* Assume ASCII until non-ASCII is seen       */

/*
   If quiet equals 1 (set with the "-q" option), suppress the
   informational reports and the file summary; only the exit status
   and messages for fatal conditions remain.
*/
int quiet = 0;

/*
   If ascii_only equals 1 (set with the "-a" option), exit
   with EXIT_FAILURE status if a non-ASCII character is read.
*/
int ascii_only  = 0;

/*
   Check the "expurgated" version of the Unicode Standard, the one
   without the Byte Order Mark, after the Monty Python "Bookshop"
   skit with the "expurgated" version of _Olsen's Standard Book of
   British Birds,_ the one without the gannet--because the customer
   didn't like them.  "The one without the Byte Order Mark?!  They've
   ALL got the Byte Order Mark!  It's a standard part of the Unicode
   Standard, the Byte Order Mark, it's in all the books!"

   When set, the check fails if the file contains a Byte Order Mark,
   even though a BOM is legal Unicode in a UTF-8 file.
   Enable this with the "--expurgated" option.
*/
int expurgated  = 0;

/*
   Called by every scanner rule with the TYPE_* class of the token
   just matched; defined in the user-code section of this file.
*/
void type_check (int);        /* Check this character type for consistency  */

%}

/*
   Named byte patterns used by the rules section.  All byte values are
   written in octal.

   CONTROL_ASCII and BINARY both contain \177; the CONTROL_ASCII rule is
   listed before the BINARY rule, so it wins for that byte.

   NOTE(review): SURROGATE_BE and SURROGATE_LE are not referenced by any
   rule in this file, and their byte values look unusual for UTF-16
   surrogates (surrogate lead bytes 0xD8..0xDF are octal 330..337) --
   confirm before wiring them into a rule.
*/
%option noyywrap

ASCII		[\040-\176]
CONTROL_ASCII	[\000-\037]|\177
BOM_UTF8	\357\273\277
CONTROL_UTF8	\302[\200-\237]
SURROGATE8	\355[\240-\277][\200-\277]
PUA0		(\356([\200-\277]{2}))|(\357([\200-\243][\200-\277]))
PUA15		\363[\260-\277][\200-\277][\200-\277]
PUA16		\364[\200-\217][\200-\277][\200-\277]
BINARY		[\177-\377]
BOM_UTF16_BE	\376\377
BOM_UTF16_LE	\377\376
SURROGATE_BE	\303[\230-\233][\000-\377]{2}\303[\234-\237][\000-\377]{2}
SURROGATE_LE	[\230-\233]\303[\000-\377]{2}[\234-\237]\303[\000-\377]{2}

%%
   /*
      Every rule hands the class of the matched byte sequence to
      type_check().  The scanner prefers the longest match and, for
      equal lengths, the rule listed first, so the specific classes
      (surrogates, BOMs, noncharacters, Private Use Areas) must stay
      ahead of the generic UTF-8 rules at the bottom.

      The generic three- and four-byte rules restrict the first
      continuation byte after \340 and \360 so that overlong encodings
      (code points that fit in a shorter sequence) are not accepted as
      UTF-8; their lead byte falls through to the BINARY rule instead.
   */
{SURROGATE8}	type_check (TYPE_SURROGATE8); /* Illegal surrogate pair   */
{ASCII}		type_check (TYPE_ASCII);      /* ASCII                    */
{BOM_UTF8}	type_check (TYPE_BOM8);       /* UTF-8 Byte Order Mark    */
{BOM_UTF16_BE}	type_check (TYPE_BOM16_BE);   /* UTF-16 big-endian BOM    */
{BOM_UTF16_LE}	type_check (TYPE_BOM16_LE);   /* UTF-16 little-endian BOM */
{CONTROL_ASCII}	type_check (TYPE_CONTROL_A);  /* ASCII control code       */
{CONTROL_UTF8}	type_check (TYPE_CONTROL_8);  /* UTF-8 control code       */
{BINARY}	type_check (TYPE_BINARY);     /* Non-text--stop           */
\357\277[\276\277]				type_check (TYPE_NONCHAR);
\360[\237\257\277]\277[\276\277]		type_check (TYPE_NONCHAR);
[\361-\363][\217\237\257\277]\277[\276\277]	type_check (TYPE_NONCHAR);
\364\217\277[\276\277]				type_check (TYPE_NONCHAR);
\357\267[\220-\257]		type_check (TYPE_NONCHAR);/* U+FDD0..U+FDEF   */
{PUA0}				{ type_check (TYPE_UTF8); pua0_seen++;  }
{PUA15}				{ type_check (TYPE_UTF8); pua15_seen++; }
{PUA16}				{ type_check (TYPE_UTF8); pua16_seen++; }
[\302-\337][\200-\277]		type_check (TYPE_UTF8); /*  U+0080..U+07FF   */
\340[\240-\277][\200-\277]	type_check (TYPE_UTF8); /*  U+0800..U+0FFF   */
[\341-\357]([\200-\277]{2})	type_check (TYPE_UTF8); /*  U+1000..U+FFFF   */
\360[\220-\277][\200-\277]{2}	type_check (TYPE_UTF8); /* U+10000..U+3FFFF  */
[\361-\363][\200-\277]{3}	type_check (TYPE_UTF8); /* U+40000..U+FFFFF  */
%%

/*
   Program entry point.

   Parses the command line, runs the scanner over the input, and
   (unless "-q" was given) prints a summary of what was found.

      -a            fail on the first non-ASCII character
      -q            quiet mode
      -i file       read from file instead of stdin
      -o file       write reports to file instead of stdout
      --expurgated  fail if a Byte Order Mark is present

   Returns EXIT_SUCCESS if the command line was valid and the scan ran
   to completion; EXIT_FAILURE on a bad command line or if a file could
   not be opened.  Fatal conditions found while scanning terminate the
   program from inside type_check().
*/
int
main (int argc, char *argv[])
{
   int i;            /* index into argv     */
   int exit_status;  /* program exit status */

   void print_help (char *);

   exit_status = EXIT_SUCCESS;
   yyin  = stdin;
   /*
      Must be a real stream before any report is written; the scanner
      only supplies its own default once yylex () has been entered.
   */
   yyout = stdout;

   /* Stop at the first bad argument so the help text is printed once. */
   for (i = 1; i < argc && exit_status == EXIT_SUCCESS; i++) {
      if (argv[i][0] != '-') {
         print_help (argv[0]);
         exit_status = EXIT_FAILURE;
         break;
      }
      switch (argv[i][1]) {
         case '-':
            if (strcmp (&argv[i][2], "expurgated") == 0) {
               expurgated = 1;
            }
            else {  /* unrecognized long option */
               print_help (argv[0]);
               exit_status = EXIT_FAILURE;
            }
            break;
         case 'a':
            ascii_only = 1;
            break;
         case 'i':
         case 'o':
            /* The file name follows in the next parameter. */
            if (i + 1 >= argc) {
               fprintf (stderr, "\nMissing file name after %s.\n", argv[i]);
               print_help (argv[0]);
               exit_status = EXIT_FAILURE;
            }
            else if (argv[i][1] == 'i') {
               yyin = fopen (argv[++i], "r");
               if (yyin == NULL) {
                  perror (argv[i]);
                  exit_status = EXIT_FAILURE;
               }
            }
            else {
               FILE *outfile = fopen (argv[++i], "w");

               if (outfile == NULL) {
                  /* Leave yyout on stdout so it is never NULL. */
                  perror (argv[i]);
                  exit_status = EXIT_FAILURE;
               }
               else {
                  yyout = outfile;
               }
            }
            break;
         case 'q':
            quiet = 1;
            break;
         default:
            print_help (argv[0]);
            exit_status = EXIT_FAILURE;
            break;
      }
   }

   /* Nothing was scanned, so there is nothing to summarize. */
   if (exit_status != EXIT_SUCCESS) {
      return exit_status;
   }

   yylex ();

   /*
      Print summary.
   */
   if (quiet == 0) {
      fprintf (yyout, "\nFILE-SUMMARY:\n\n");
      fprintf (yyout, "Character-Set: %s\n", char_set_string [char_set]);

      if (bom_begin != 0)
         fprintf (yyout, "BOM-AT-START\n");
      if (bom_inside != 0)
         fprintf (yyout, "BOM-AFTER-START\n");
      if (null_seen != 0)
         fprintf (yyout, "CONTAINS-NULLS\n");
      if (cr_seen != 0)
         fprintf (yyout, "CONTAINS-CARRIAGE_RETURN\n");
      if (ctrl_seen != 0)
         fprintf (yyout, "CONTAINS-CONTROL_CHARACTERS\n");
      if (esc_seen != 0)
         fprintf (yyout, "CONTAINS-ESCAPE_SEQUENCES\n");
      if (pua0_seen != 0)
         fprintf (yyout, "Plane-0-PUA: %d characters\n", pua0_seen);
      if (pua15_seen != 0)
         fprintf (yyout, "Plane-15-PUA: %d characters\n", pua15_seen);
      if (pua16_seen != 0)
         fprintf (yyout, "Plane-16-PUA: %d characters\n", pua16_seen);
   }

   return exit_status;
}


/*
   Write the usage message to stderr.

      progname - program name as taken from argv[0]; accepted for the
                 caller's convenience but not used in the message.
*/
void
print_help (char * progname)
{
   /* The message, one entry per output call, in display order. */
   static const char *const usage_text[] = {
      "\nUnknown command line parameter.\n\n",
      "Syntax: utfcheck [-a] [-q] [--expurgated] [-i input_file] [-o output_file]\n\n",
      "    -a: ASCII only check\n\n",
      "    -i: specify input file name\n\n",
      "    -o: specify output file name\n\n",
      "    -q: quiet mode; output only on fatal error\n\n",
      "    --expurgated: fail if Byte Order Mark present\n\n"
   };
   size_t entry;

   (void) progname;

   for (entry = 0; entry < sizeof usage_text / sizeof usage_text[0]; entry++)
      fputs (usage_text[entry], stderr);
}


/*
   Note the type of this byte or byte sequence and see if it
   differs from what was seen previously.
*/
void
type_check (int this_type)
{
   int i;       /* Loop variable */
   int codept;  /* Unicode code point conversion of UTF-8 byte string */

   void parse_be ();       /* Parse rest of file as big-endian    UTF-16 */
   void parse_le ();       /* Parse rest of file as little-endian UTF-16 */
   int  cvt4utf8 (char *); /* Convert UTF-8 to UTF-32 code point         */

   /*
      If a byte sequence in the range of the Unicode surrogate pair range
      was detected (U+D800..U+DFFF), fail immediately because it is not
      valid unless in a UTF-16 file.  This version of utfcheck does not
      support UTF-16.
   */
   if (this_type == TYPE_SURROGATE8) {
      fprintf (yyout, "SURROGATE-PAIR-CODE-POINT: 0x");
      for (i = 0; yytext[i] != '\0'; i++)
         fprintf (yyout, "%02X ", yytext[i] & 0xFF);
      codept = cvt4utf8 (yytext);
      fprintf (yyout, " (U+%04X)\n", codept);
      exit (EXIT_FAILURE);
   }
   if (first_char) {  /* Look for Byte Order Mark at start of file */
      switch (this_type) {
         case TYPE_BOM8:
            utf_type = TYPE_UTF8;
            char_set = SET_UTF8;
            bom_begin = 1;  /* File starts with UTF-8 Byte Order Mark */
            fprintf (yyout, "UTF-8-BOM-BEGIN\n");
            /* Oh no!  We didn't want a BOM and we got one!  The horror! */
            if (expurgated == 1)
               exit (EXIT_FAILURE);
            break;
         case TYPE_BOM16_BE:  /* Hooks for handling UTF-16 in the future */
            utf_type = TYPE_UTF16_BE;
            char_set = SET_UTF16_BE;
            bom_begin = 1;  /* UTF-16 big-endian Byte Order Mark */
            fprintf (yyout, "UTF-16-BE: Unsupported\n");
            exit (EXIT_FAILURE);
         case TYPE_BOM16_LE:  /* Hooks for handling UTF-16 in the future */
            utf_type = TYPE_UTF16_LE;
            char_set = SET_UTF16_LE;
            bom_begin = 1;  /* UTF-16 little-endian Byte Order Mark */
            fprintf (yyout, "UTF-16-LE: Unsupported\n");
            exit (EXIT_FAILURE);
         case TYPE_UTF16_BE:  /* Hooks for handling UTF-16 in the future */
            utf_type = TYPE_UTF16_BE;
            char_set = SET_UTF16_BE;
            fprintf (yyout, "UTF-16-BE: Unsupported\n");
            exit (EXIT_FAILURE);
         case TYPE_UTF16_LE:  /* Hooks for handling UTF-16 in the future */
            utf_type = TYPE_UTF16_LE;
            char_set = SET_UTF16_LE;
            fprintf (yyout, "UTF-16-LE: Unsupported\n");
            exit (EXIT_FAILURE);
      }
   }
   if (utf_type != this_type) {  /* See if type transition is valid */
      switch (this_type) {
         case TYPE_ASCII:
            break;
         case TYPE_CONTROL_A:
            /* In order: Carriage Return or Escape; note it but don't print */
            if (yytext[0] == '\015' || yytext[0] == '\033')
               cr_seen = 1;
            /* In order: Horizontal Tab, Line Feed, Vertical Tab, or Form Feed */
            else if (yytext[0] != '\011' && yytext[0] != '\012' &&
                     yytext[0] != '\013' && yytext[0] != '\014') {
               ctrl_seen = 1;  /* Non-printable control character embedded in file */
               if (yytext[0] == '\0')
                  null_seen = 1;
               if (quiet == 0) {
                  if (yytext[0] == '\0')
                     fprintf (yyout, "ASCII-NULL\n");
                  else
                     fprintf (yyout, "ASCII-CONTROL: U+%04X\n", yytext[0]);
               }
            }
            break;
         case TYPE_BOM8:
            utf_type = TYPE_UTF8;
            char_set = SET_UTF8;
            if (first_char == 0) {  /* If past the first UTF-8 character in file */
               bom_inside++;   /* UTF-8 Byte Order Mark seen after start of file */
               if (quiet == 0 || expurgated == 1)
                  fprintf (yyout, "UTF-8-BOM-EMBEDDED\n");
            }
            /* We didn't want a BOM but we got one. */
            if (expurgated == 1)
               exit (EXIT_FAILURE);
            break;
         case TYPE_UTF8:      /* A UTF-8 code point */
            utf_type = TYPE_UTF8;
            char_set = SET_UTF8;
            break;
         case TYPE_CONTROL_8:
            utf_type = TYPE_UTF8;
            char_set = SET_UTF8;
            ctrl_seen = 1;  /* Non-printable control character embedded in file */
            if (quiet == 0) {
               fprintf (yyout, "UTF-8-CONTROL: 0x");
               fprintf (yyout, "%02X %02X ", yytext[0] & 0xFF, yytext[1] & 0xFF);
               codept = cvt4utf8 (yytext);
               fprintf (yyout, " (U+%04X)\n", codept);
            }
            break;
         case TYPE_NONCHAR:
            utf_type = TYPE_UTF8;
            char_set = SET_UTF8;
            if (quiet == 0) {
               fprintf (yyout, "UTF-8-NONCHARACTER: 0x");
               for (i = 0; yytext[i] != '\0'; i++)
                  fprintf (yyout, "%02X ", yytext[i] & 0xFF);
               codept = cvt4utf8 (yytext);
               fprintf (yyout, " (U+%04X)\n", codept);
            }
            break;
         case TYPE_UTF16_BE:  /* Hooks for handling UTF-16 in the future */
            utf_type = TYPE_UTF16_BE;
            char_set = SET_UTF16_BE;
            fprintf (yyout, "UTF-16-BE: Unsupported\n");
            exit (EXIT_FAILURE);
         case TYPE_UTF16_LE:  /* Hooks for handling UTF-16 in the future */
            utf_type = TYPE_UTF16_LE;
            char_set = SET_UTF16_LE;
            fprintf (yyout, "UTF-16-LE: Unsupported\n");
            exit (EXIT_FAILURE);
         case TYPE_BINARY:
            utf_type = TYPE_BINARY;
            char_set = SET_BINARY;
            fprintf (yyout, "BINARY-DATA: 0x%02X\n", yytext[0] & 0xFF);
            exit (EXIT_FAILURE);
            break;
      }
      first_char = 0;  /* Done processing first character in input file */
   }

   if ((ascii_only == 1) && (char_set != SET_ASCII)) {
      fprintf (yyout, "NON-ASCII-DATA: 0x%02X\n", yytext[0] & 0xFF);
      exit (EXIT_FAILURE);
   }

   nchars++;

   return;
}


/*
   cvt4utf8 - convert one UTF-8 byte sequence to its code point (UTF-32)

      utf8_bytes[] - NUL-terminated UTF-8 byte string; only the first
                     character in it is decoded

      return value: the Unicode code point, or -1 if the bytes do not
                    form a UTF-8 sequence: a continuation byte in the
                    lead position, a lead byte announcing more than four
                    bytes, a missing or malformed continuation byte, or
                    a value above U+10FFFF.

   Surrogate code points (U+D800..U+DFFF) and overlong forms are
   decoded rather than rejected, so that callers can report the
   offending code point.
*/
int
cvt4utf8 (char *utf8_bytes)
{
   int i;               /* loop variable */
   int nbytes;          /* number of bytes in this UTF-8 byte string     */
   unsigned mask;       /* to get number of '1' bits in first UTF-8 byte */
   unsigned lead_byte;  /* first byte, stripped of its length prefix     */
   unsigned next_byte;  /* continuation byte currently being examined    */
   unsigned retval;     /* code point being assembled                    */

   /*
      Count the leading '1' bits of the first byte, clearing each one
      as it is counted; what remains are the payload bits.
   */
   lead_byte = utf8_bytes[0] & 0xFF;
   mask      = 0x80;
   nbytes    = 0;
   while ((lead_byte & mask) != 0) {
      nbytes++;
      lead_byte ^= mask;
      mask >>= 1;
   }

   if (nbytes == 0) {
      return (int) lead_byte;  /* ASCII character */
   }
   if (nbytes == 1 || nbytes > 4) {
      return -1;  /* stray continuation byte, or binary data */
   }

   retval = lead_byte;
   for (i = 1; i < nbytes; i++) {
      next_byte = utf8_bytes[i] & 0xFF;
      /*
         Every continuation byte must look like 10xxxxxx.  This also
         stops at the terminating NUL of a truncated sequence instead
         of reading past the end of the string.
      */
      if ((next_byte & 0xC0) != 0x80) {
         return -1;
      }
      retval = (retval << 6) | (next_byte & 0x3F);
   }

   if (retval > 0x10FFFF) {
      return -1;  /* beyond the Unicode code space */
   }

   return (int) retval;
}

