/* Copyright (c) 1990 - 2009 by H. Robbers.
 *
 * This file is part of AHCC.
 *
 * AHCC is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * AHCC is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with AHCC; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/*	A pretokenizer based on the C language.

	input: character string.
	output: aggregation of LEX_RECORD structures.

	Its concept is kept as simple as possible.
	A text is divided in:
		identifiers (letter + opt(letters + digits))
		numbers
		newlines
		consecutive white space (space or tab)
		C operator characters in 2 kinds: single or multiple
		consecutive punctuation (anything else)

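	lines are spliced on encountering \ followed by newline or carriage return
	comments are skipped: slash-star block comments (optionally nested) and
	double-slash line comments (the delimiters are spelled out here because
	block comments do not nest in standard C)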
	A copy of the original can be produced (see testframe)

	Because everything is done in a single function it is extremely fast,
	making maximum use of registers.

	On a TT it takes 0.15 seconds for a 60Kb C source file.
	On a 3 GHz Aranym machine everything will appear to be instantaneous :-)

	Due to the nature of the process, the output cannot become larger than
	6 times the input:

	If every character changed category, each character would result in:

	2 byte record length
	1 byte token length
	1 byte category
	1 byte of source
	1 byte null termination
*/

#include <stdio.h>
#include <tos.h>
#include "c_lex.h"

/* Variadic console output routine defined elsewhere in the project.
   Nothing in the code visible in this file calls it. */
void console(char *, ...);

/* selection & translation tables */
/* selection & translation tables */

/*
 * C_simple: category code for every possible input byte, indexed by the
 * byte value.  C_lexical looks each character up here to decide what kind
 * of token starts at it and how far that token extends.
 *
 * Each initializer row covers 16 consecutive byte values.
 *   ws       tab and space
 *   nl       LF (10) and CR (13)
 *   dig      '0'..'9'
 *   ide      letters, '_' and the bytes 0x80-0xa8 and 0xb0-0xb8
 *            (presumably accented letters of the target character set;
 *            TODO confirm)
 *   quo/apo  double quote / single quote
 *   hex      '$', which C_lexical treats as a hexadecimal prefix
 *   op       operator characters C_lexical always emits one at a time
 *   mop      characters that may begin a multi-character operator
 *   X        everything else
 * Entry 0 (NUL, the end of the input) is the literal 0.
 * The category names themselves are not defined in this file.
 */
static
uchar C_simple[256] =	/* simple character categorization */
{
/* 0x00-0x0f: control characters; tab = ws, LF and CR = nl */
	0,X,X,X,X,X,X,X,X,ws,nl,X,X,nl,X,X,
/* 0x10-0x1f: control characters */
	X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,

/* 0x20-0x2f: the first entry is the space character */
/*     !    "   #  $   %   &   '   (  )   *   +  ,   -   .   /     */
	ws,mop,quo,mop,hex,mop,mop,apo,op,op,mop,mop,op,mop,mop,mop,

/* 0x30-0x3f: digits 0-9 followed by punctuation */
/*                                          :   ;   <   =   >  ?   */
	dig,dig,dig,dig,dig,dig,dig,dig,dig,dig,mop,op,mop,mop,mop,op,

/* 0x40-0x4f: '@' then A-O */
/*  @   in embedded assembler: structure@member for offset */
	op,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,

/* 0x50-0x5f: P-Z then punctuation; '_' is an identifier character */
/*                                              [   \  ]   ^   _   */
	ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,op,mop,op,mop,ide,

/* 0x60-0x6f: back quote (X) then a-o */
	X,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,

/* 0x70-0x7f: p-z, punctuation, DEL (X) */
/*                                              {   |  }  ~        */
	ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,op,mop,op,op,X,

/* 0x80-0x9f: all identifier characters */
	ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,
	ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,ide,
/* 0xa0-0xaf: 0xa0-0xa8 identifier characters, rest X */
	ide,ide,ide,ide,ide,ide,ide,ide,ide,X,X,X,X,X,X,X,
/* 0xb0-0xbf: 0xb0-0xb8 identifier characters, rest X */
	ide,ide,ide,ide,ide,ide,ide,ide,ide,X,X,X,X,X,X,X,
/* 0xc0-0xff: all X */
	X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,
	X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,
	X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,
	X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X
};

/*
 * C_hex: translation table for hexadecimal digits, indexed by byte value.
 * '0'..'9' map to themselves, 'a'..'f' and 'A'..'F' both map to the
 * lower case letter, every other byte maps to 0.
 * C_lexical copies the non-zero entry into the token text and stops
 * collecting a hexadecimal number at the first zero entry, so the digits
 * of a hexadecimal token are always stored in lower case.
 * Each row covers 16 consecutive byte values.
 */
global
uchar C_hex[256] =
{
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x00 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x10 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x20 */
	'0','1','2','3','4','5','6','7','8','9',0,0,0,0,0,0,	/* 0x30: digits */
	0,'a','b','c','d','e','f',0,0,0,0,0,0,0,0,0,	/* 0x40: A-F folded to lower case */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x50 */
	0,'a','b','c','d','e','f',0,0,0,0,0,0,0,0,0,	/* 0x60: a-f */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x70 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x80 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x90 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0xa0 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0xb0 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0xc0 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0xd0 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0xe0 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0		/* 0xf0 */
};

/*
 * C_oct: translation table for octal digits, indexed by byte value.
 * '0'..'7' map to themselves, every other byte maps to 0.
 * Nothing in the code visible in this file reads it; presumably it is
 * used by other modules (verify against the callers).
 * Each row covers 16 consecutive byte values.
 */
global
uchar C_oct[256] =
{
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x00 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x10 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x20 */
	'0','1','2','3','4','5','6','7',0,0,0,0,0,0,0,0,	/* 0x30: octal digits */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x40 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x50 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x60 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x70 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x80 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x90 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0xa0 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0xb0 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0xc0 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0xd0 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0xe0 */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0		/* 0xf0 */
};

/*
 * C_lexical: split a NUL terminated source text into a packed sequence of
 * LEX_RECORDs.
 *
 * input     the source text; scanning stops at the first NUL byte.
 * output    buffer that receives the records.  Nothing here checks its
 *           size, so the caller must supply a buffer that is large enough.
 * lines     if not a null pointer, receives the final line counter
 *           (1 + the number of line ends seen).
 * nest_com  when true, block comments may nest.
 * for_S     when true the text is treated as assembler source: a '*'
 *           directly after a line end and a ';' anywhere start a comment
 *           that runs up to the next line feed.
 *
 * Returns the total number of bytes written, including the closing
 * record of category eof.
 *
 * Every record holds its own length (rl), a category (cat) and the token
 * text followed by a NUL.  rl is size_LEX + text length + 1, so the next
 * record starts rl bytes after the current one.  A record of category nl
 * carries no text; it carries the number of the line that follows, stored
 * as a long on an even address.  A token text is limited to 254
 * characters; a longer token is reported and cut off.
 */
global
long C_lexical(char *input, LEX_RECORD *output, long *lines, bool nest_com, bool for_S)
{
	LEX_RECORD *out = output;
	uchar *s = (uchar *)input;
	long l, tot = 0, lineno = 1;
	short t, ti;

	while (*s)					/* input is a true C string */
	{
		char *o = out->text;
		uint c = *s++;
		t = ti = C_simple[c];	/* get category */
		out->cat = t;

		if (t eq nl)			/* new line: gets line number as data */
		{
			lineno++;
			if (c eq '\r' and *s eq '\n')	/* CR LF counts as one line end */
				s++;
			if ((long)o & 1)	/* put the long on an even address */
				*o++ = 0;
			*(long *)o = lineno;
			o += sizeof(long);
			if (for_S and *s eq '*')	/* assembler comment '*' must be first in a line */
			{
				/* skip up to, but not including, the next line feed;
				   the record then becomes a comment record */
				while (*s)
				{
					if (*s eq '\n')
					{
						*o++ = ' ';
						out->cat = com;
						break;
					}
					s++;
				}
			}
		}
		else
		{
			if (t eq ide)
			{
				/* identifier: a letter followed by letters and digits */
				while (ti eq ide or ti eq dig)
				{
					*o++ = c;
					c = *s++;
					ti = C_simple[c];
				}
				s--;			/* give back the character that ended it */
				*o = 0;
			}
			else			/* punctuation or digits */
			{
				if (    t eq dig
				    and c eq '0'
				    and (*s eq 'x' or *s eq 'X')
				   )
				{
					/* 0x prefixed number; C_hex stores the digits in lower case */
					*o++ = c;
					*o++ = *s++;
					while ((c= C_hex[*s]) ne 0)
						*o++ = c, s++;
					out->cat = hex;
				}
				elif (t eq hex)
				{
					/* '$' prefixed number: stored as if it was written with 0x */
					*o++ = '0';
					*o++ = 'x';
					while ((c= C_hex[*s]) ne 0)
						*o++ = c, s++;
					out->cat = hex;
				}
				elif (t eq mop and c eq '/' and *s eq '*')
				{
					/* block comment: skipped, replaced by a single space.
					   An unterminated comment runs to the end of the input. */
					short lvl = 0;	/* nesting depth below the outermost comment */

					s++;
					while (*s)
					{
						if (*s eq '\n')
							lineno++;
						if (nest_com and *s eq '/' and *(s+1) eq '*')	/* nested comment opens */
							s += 2, lvl++;
						elif (*s eq '*' and *(s+1) eq '/')
						{
							s += 2;
							if (lvl eq 0)
								break;
							lvl--;
						}
						else
							s++;
					}

					*o++ = ' ';
					out->cat = com;
				}
				elif (t eq quo or t eq apo)
				{
					/* string or character constant, quotes included */
					*o++ = c;
					while (*s)
					{
						if (*s eq '\r' or *s eq '\n')
						{
							/* an unescaped line end terminates the constant;
							   the line end itself is left for the next token */
							if (*s eq '\r' and *(s+1) eq '\n')
								s++;
							break;
						}
						if (    *s eq '\\'
						    and (*(s+1) eq c or  *(s+1) eq '\\')
						   )
						{
							/* escaped quote or backslash: copy both characters */
							*o++ = *s++;
							*o++ = *s++;
						}
						elif (*s eq c)
						{
							*o++ = *s++;	/* closing quote */
							break;
						}
						elif (    *s eq '\\'
						      and (*(s+1) eq '\r' or *(s+1) eq '\n')
						     )
						{
							/* line splice inside the constant: drop it */
							s++;
							if (*s eq '\r')
								s++;
							if (*s eq '\n')
								s++;
							lineno++;
						}
						else
							*o++ = *s++;
					}
				}
				elif (for_S and t eq op and c eq ';')
				{
					/* assembler comment up to, but not including, the next
					   line feed; also a comment when the input ends first */
					while (*s and *s ne '\n')
						s++;
					*o++ = ' ';
					out->cat = com;
				}
#if MOPS_HERE
				elif (t eq mop)		/* multiple tokens */
				{
					/* dispatch on the current and the next character,
					   combined in a two character constant */
					switch((c<<8)|*s)
					{
						case '//':
							/* line comment up to, but not including, the next
							   line feed; also a comment when the input ends first */
							s++;
							while (*s and *s ne '\n')
								s++;
							*o++ = ' ';
							out->cat = com;
						break;
						case '##':
							if (*(s+1) eq '#' and *(s+2) eq '#')
							{
								*o++ = c;
								*o++ = *s++;
								*o++ = *s++;
								*o++ = *s++;
								c = *s;
								ti = C_simple[c];
								out->cat = op3;
							othw
								*o++ = c;
								*o++ = *s++;
								c = *s;
								ti = C_simple[c];
								out->cat = op2;
							}
						break;
						case '>>':			/* trops */
						case '<<':
							if (*(s+1) eq '=')
							{
								*o++ = c;
								*o++ = *s++;
								*o++ = *s++;
								c = *s;
								ti = C_simple[c];
								out->cat = op3;
								break;
							}
							/* else fall thru: 2 are valid */
						case '==':
						case '++':
						case '+=':
						case '-=':
						case '--':
						case '->':
						case '*=':
						case '%=':
						case '/=':
						case '&&':
						case '&=':
						case '|=':
						case '||':
						case '!=':
						case '^=':
						case ':=':		/* although this isn't C, it is very useful to be able to detect it :-) */
						case '<=':
						case '>=':
							*o++ = c;
							*o++ = *s++;
							c = *s;
							ti = C_simple[c];
							out->cat = op2;
						break;
						case '\\\r':		/* bloody splice spoils everything */
							if (*(s+1) eq '\n')
								s += 2;		/* \r\n */
							else
								s += 1;		/* \r only */
							lineno++;
						continue;	/* not a token */
						case '\\\n':		/* \n only */
							lineno++;
							s++;
						continue;			/* not a token */
						case '..':
							if (*(s+1) eq '.')
							{
								*o++ = c;
								*o++ = *s++;
								*o++ = *s++;
								c = *s;
								ti = C_simple[c];
								out->cat = op3;
								break;
							}
							/* else fall thru: .. not valid) */
						default:				/* is a single */
							*o++ = c;
							c = *s;
							ti = C_simple[c];
							out->cat = op;
					}
				}
#endif
				elif (t eq op)		/* only 1 */
				{
					*o++ = c;
					c = *s;
					ti = C_simple[c];
				}
				else
				{
					/* anything else: collect the run of characters that
					   share the category of the first one */
					while(ti eq t)
					{
						*o++ = c;
						c = *s++;
						ti = C_simple[c];
					}
					s--;		/* give back the character that ended the run */
				}
			}
			*o++ = 0;
		}

		l = o - out->text - 1;		/* exclude \0 */
		if (l > 254)
		{
			printf("line %ld, token %ld > 254\n", lineno, l);
			l = 254;
			/* the original terminator lies beyond the shortened record and
			   is overwritten by the next one, so terminate the text here */
			out->text[l] = 0;
		}

		l++;						/* include \0 */
		l += size_LEX;
		out->rl = l;
		out = (LEX_RECORD *)((char *)out + l);	/* a cast is not an lvalue in standard C */
		tot += l;
	}

	/* closing record: no text, category eof */
	out->rl = size_LEX;
	out->cat = eof;

	tot += size_LEX;

	if (lines)
		*lines = lineno;

	return tot;
}
