Working preliminary version of parser

author: tomsmeding <tom.smeding@gmail.com> 2016-08-05 20:26:05 +0200
committer: tomsmeding <tom.smeding@gmail.com> 2016-08-06 10:10:32 +0200
commit: f67988fbfde6ad8a91466ef5d4227dcf9e5db6ce (patch)
tree: db85d3936f717331c3eeed4fae43e5ed43324be9 /parser.c
parent: e6bb770a52980ef3d85c2d4b93fb240c026ce7f7 (diff)
1 files changed, 430 insertions, 108 deletions
diff --git a/parser.c b/parser.c
index 96b9bf7..14fd47d 100644
--- a/parser.c
+++ b/parser.c
@@ -2,193 +2,515 @@
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
+#include <assert.h>
 
 #include "memory.h"
+#include "opfuncs.h"
 #include "parser.h"
 
 
+#define NOT_IMPLEMENTED false
+
+
+static bool ishexdigit(char c){
+	return (c>='0'&&c<='9')||(c>='a'&&c<='f')||(c>='A'&&c<='F');
+}
+
+static int hexnumber(char c){
+	return c<='9'?c-'0':(c&~('a'-'A'))-'A'+10;
+}
+
+static char hexencode(int n){
+	return n<10?n+'0':n+'a';
+}
+
+
 typedef enum Tokentype{
 	TT_NUM,
 	TT_STR,
 	TT_WORD,
-	TT_SYM
+	TT_OP,
+	TT_SYM, //all symbols that are not operators
+	TT_ENDSTMT,
+	TT_EOF,
+
+	TT_ERR=-1
 } Tokentype;
 
 typedef struct Token{
-	const char *str;
+	Tokentype type;
+	const char *str; //Part of another string; not null-terminated, and do not free
 	int len;
 } Token;
 
-Token nexttoken(const char **sourcep){
+
+static bool parsecomment(const char **sourcep){
+	const char *source=*sourcep;
+	if(*source!='#')return false;
+	if(source[1]=='#'&&source[2]=='#'){
+		source+=3;
+		while(*source&&
+			  (*source!='#'||source[1]!='#'||source[2]!='#')){
+			source++;
+		}
+		if(!*source)return false; //unclosed block comment
+		source+=2;
+	} else {
+		while(*source&&*source!='\n')source++;
+		if(*source)source++;
+	}
+	*sourcep=source;
+	return true;
+}
+
+static void skipintermediate(const char **sourcep){
+	const char *source=*sourcep;
+	bool acted;
+	do {
+		acted=false;
+		while(isspace(*source)){
+			source++;
+			acted=true;
+		}
+		if(parsecomment(&source)){
+			acted=true;
+		}
+	} while(acted);
+	*sourcep=source;
+}
+
+static Token nexttoken(const char **sourcep){
+	skipintermediate(sourcep);
 	const char *source=*sourcep;
-	while(isspace(*source))source++;
+	if(*source=='\0'){
+		Token tok={TT_EOF,NULL,-1};
+		return tok;
+	}
+	if(*source==';'){
+		Token tok={TT_ENDSTMT,source,1};
+		(*sourcep)++;
+		return tok;
+	}
 	if(isdigit(*source)||(*source=='-'&&isdigit(source[1]))){
 		char *endp;
 		strtod(source,&endp);
 		assert(endp!=source);
-		Token tok={source,endp-source};
+		Token tok={TT_NUM,source,endp-source};
+		*sourcep=endp;
 		return tok;
 	}
-}
-
-
-int precedence(const char *op){
-	switch(op[0]){
-		case '!': return op[1]=='\0' ? 12 : op[1]=='='&&op[2]=='\0' ?  5 : -1;
-		case '%': return op[1]=='\0' ? 11 : -1;
-		case '&': return op[1]=='\0' ?  9 : op[1]=='&'&&op[2]=='\0' ?  4 : -1;
-		case '*': return op[1]=='\0' ? 11 : op[1]=='*'&&op[2]=='\0' ? 14 : -1;
-		case '+': return op[1]=='\0' ? 10 : -1;
-		case '-': return op[1]=='\0' ? 10 : -1;
-		case '/': return op[1]=='\0' ? 11 : op[1]=='/'&&op[2]=='\0' ? 11 : -1;
-		case '<': return op[1]=='\0' ?  6 : op[1]=='='&&op[2]=='\0' ?  6 : -1;
-		case '=': return op[1]=='\0' ?  1 : op[1]=='='&&op[2]=='\0' ?  5 : -1;
-		case '>': return op[1]=='\0' ?  6 : op[1]=='='&&op[2]=='\0' ?  6 : -1;
-		case '^': return op[1]=='\0' ?  8 : op[1]=='^'&&op[2]=='\0' ?  3 : -1;
-		case '|': return op[1]=='\0' ?  7 : op[1]=='|'&&op[2]=='\0' ?  2 : -1;
-		case '~': return op[1]=='\0' ? 12 : -1;
-		default: return -1;
+	if(*source=='"'){
+		int i;
+		for(i=1;source[i]&&source[i]!='"';i++){
+			if(source[i]=='\\')i++;
+		}
+		if(!source[i]){
+			Token tok={TT_ERR,"Non-terminated string",21};
+			return tok;
+		}
+		*sourcep+=i+1;
+		Token tok={TT_STR,source,i+1};
+		return tok;
 	}
-}
-
-int associativity(const char *op){
-	switch(op[0]){
-		case '!': return op[1]=='\0' ? AS_PREFIX   : op[1]=='='&&op[2]=='\0' ? AS_NONASSOC : -1;
-		case '%': return op[1]=='\0' ? AS_LEFT     : -1;
-		case '&': return op[1]=='\0' ? AS_LEFT     : op[1]=='&'&&op[2]=='\0' ? AS_LEFT     : -1;
-		case '*': return op[1]=='\0' ? AS_LEFT     : op[1]=='*'&&op[2]=='\0' ? AS_RIGHT    : -1;
-		case '+': return op[1]=='\0' ? AS_LEFT     : -1;
-		case '-': return op[1]=='\0' ? AS_LEFT     : -1;
-		case '/': return op[1]=='\0' ? AS_LEFT     : op[1]=='/'&&op[2]=='\0' ? AS_LEFT     : -1;
-		case '<': return op[1]=='\0' ? AS_NONASSOC : op[1]=='='&&op[2]=='\0' ? AS_NONASSOC : -1;
-		case '=': return op[1]=='\0' ? AS_RIGHT    : op[1]=='='&&op[2]=='\0' ? AS_NONASSOC : -1;
-		case '>': return op[1]=='\0' ? AS_NONASSOC : op[1]=='='&&op[2]=='\0' ? AS_NONASSOC : -1;
-		case '^': return op[1]=='\0' ? AS_LEFT     : op[1]=='^'&&op[2]=='\0' ? AS_LEFT     : -1;
-		case '|': return op[1]=='\0' ? AS_LEFT     : op[1]=='|'&&op[2]=='\0' ? AS_LEFT     : -1;
-		case '~': return op[1]=='\0' ? AS_PREFIX   : -1;
-		default: return -1;
+	int oplen=parseoplength(source);
+	if(oplen!=-1){
+		Token tok={TT_OP,source,oplen};
+		*sourcep+=oplen;
+		return tok;
+	}
+	if(strchr("(){}",*source)!=NULL){
+		Token tok={TT_SYM,source,1};
+		(*sourcep)++;
+		return tok;
+	}
+	if(isalpha(*source)||*source=='_'){
+		int i;
+		for(i=1;source[i];i++){
+			if(!isalpha(source[i])&&!isdigit(source[i])&&source[i]!='_')break;
+		}
+		Token tok={TT_WORD,source,i};
+		*sourcep+=i;
+		return tok;
 	}
+	Token tok={TT_ERR,"Unrecognised token",18};
+	return tok;
 }
 
 
-static bool parsecomment(const char *source,int *reslen){
-	int cursor=0;
-	if(source[cursor]!='#')return false;
-	if(source[cursor+1]=='#'&&source[cursor+2]=='#'){
-		cursor+=3;
-		while(source[cursor]&&
-			  (source[cursor]!='#'||source[cursor+1]!='#'||source[cursor+2]!='#')){
-			cursor++;
-		}
-		if(!source[cursor])return false; //unclosed block comment
-		cursor+=2;
+static void printtoken(FILE *stream,Token tok,const char *msg){
+	const char *type;
+	switch(tok.type){
+		case TT_NUM: type="TT_NUM"; break;
+		case TT_STR: type="TT_STR"; break;
+		case TT_WORD: type="TT_WORD"; break;
+		case TT_OP: type="TT_OP"; break;
+		case TT_SYM: type="TT_SYM"; break;
+		case TT_ENDSTMT: type="TT_ENDSTMT"; break;
+		case TT_EOF: type="TT_EOF"; break;
+		case TT_ERR: type="TT_ERR"; break;
+		default: type="TT_(??\?)"; break; //TRIGRAPHS ._.
+	}
+	if(tok.len!=-1){
+		char buf[tok.len+1];
+		memcpy(buf,tok.str,tok.len);
+		buf[tok.len]='\0';
+		fprintf(stream,"(%s) Token: %s '%s'\n",msg,type,buf);
 	} else {
-		while(source[cursor]&&source[cursor]!='\n')cursor++;
-		if(source[cursor])cursor++;
+		fprintf(stream,"(%s) Token: %s (null)\n",msg,type);
 	}
-	*reslen=cursor;
-	return true;
 }
 
-static void parseintermediate(const char *source,int *reslen){
-	int cursor=0;
-	bool acted;
-	do {
-		acted=false;
-		while(source[cursor]&&isspace(source[cursor])){
-			cursor++;
-			acted=true;
+
+static AST* parseterm(const char *source,int *reslen){
+	const char *origsource=source;
+	const Token tok=nexttoken(&source);
+	printtoken(stderr,tok,"parseterm");
+	AST *node;
+	switch(tok.type){
+		case TT_NUM:{
+			node=malloc(sizeof(AST));
+			if(!node)outofmem();
+			node->type=AST_NUM;
+			char *endp;
+			int intv=strtol(tok.str,&endp,0);
+			node->n.isint=endp-tok.str==tok.len;
+			if(node->n.isint)node->n.i=intv;
+			else node->n.d=strtod(tok.str,NULL);
+			break;
 		}
-		int partlen;
-		if(parsecomment(source+cursor,&partlen)){
-			cursor+=partlen;
-			acted=true;
+
+		case TT_STR:{
+			int slen=0;
+			for(int i=1;i<tok.len-1;i++){
+				slen++;
+				if(tok.str[i]!='\\')continue;
+				i++;
+				if(tok.str[i]=='x'){
+					if(i+2>=tok.len-1||!ishexdigit(tok.str[i+1])||!ishexdigit(tok.str[i+2])){
+						return NULL;
+					}
+					i+=2;
+				} else {
+					i++;
+				}
+			}
+			node=malloc(sizeof(AST));
+			if(!node)outofmem();
+			node->type=AST_STR;
+			node->s.str=malloc(slen+1);
+			if(!node->s.str)outofmem();
+			int j=0;
+			for(int i=1;i<tok.len-1;i++){
+				if(tok.str[i]!='\\'){
+					node->s.str[j++]=tok.str[i];
+					continue;
+				}
+				i++;
+				switch(tok.str[i]){
+					case 'n': node->s.str[j++]='\n'; break;
+					case 'r': node->s.str[j++]='\r'; break;
+					case 't': node->s.str[j++]='\t'; break;
+					case 'b': node->s.str[j++]='\b'; break;
+					case 'a': node->s.str[j++]='\a'; break;
+					case 'x':
+						node->s.str[j++]=16*hexnumber(tok.str[i+1])+hexnumber(tok.str[i+2]);
+						i+=2;
+						break;
+					default:
+						node->s.str[j++]=tok.str[i];
+						break;
+				}
+			}
+			node->s.str[j]='\0';
+			break;
 		}
-	} while(acted);
-	*reslen=cursor;
+
+		case TT_WORD:{
+			if(tok.len==2&&memcmp(tok.str,"if",2)==0)assert(NOT_IMPLEMENTED);
+			if(tok.len==5&&memcmp(tok.str,"while",2)==0)assert(NOT_IMPLEMENTED);
+			const char *tempsource=source;
+			Token next=nexttoken(&source);
+			if(next.len==1&&next.str[0]=='(')assert(NOT_IMPLEMENTED);
+			source=tempsource;
+			node=malloc(sizeof(AST));
+			if(!node)outofmem();
+			node->type=AST_VAR;
+			node->v.name=malloc(tok.len+1);
+			if(!node->v.name)outofmem();
+			memcpy(node->v.name,tok.str,tok.len);
+			node->v.name[tok.len]='\0';
+			break;
+		}
+
+		case TT_SYM:
+			assert(NOT_IMPLEMENTED);
+			break;
+
+		case TT_OP:{
+			char buf[tok.len+3];
+			buf[0]='(';
+			memcpy(buf+1,tok.str,tok.len);
+			buf[tok.len+1]=')';
+			buf[tok.len+2]='\0';
+			if(associativity(buf)==AS_PREFIX){
+				node=malloc(sizeof(AST));
+				if(!node)outofmem();
+				node->type=AST_OP;
+				node->o.op=opconststring_len(buf,tok.len+2);
+				node->o.left=NULL;
+				int len;
+				node->o.right=parseterm(source,&len);
+				if(!node->o.right){
+					free(node);
+					return NULL;
+				}
+				source+=len;
+			} else return NULL;
+			break;
+		}
+
+		case TT_ENDSTMT:
+		case TT_EOF:
+		case TT_ERR:
+			return NULL;
+	}
+	*reslen=source-origsource;
+	return node;
 }
 
+//Uses precedence climbing
 static AST* parseexpr(const char *source,int *reslen,int minprec){
-	;
+	const char *origsource=source;
+	int len;
+	AST *tree=parseterm(source,&len);
+	if(!tree)return NULL;
+	source+=len;
+	while(true){
+		const char *beforeop=source;
+		Token tok=nexttoken(&source);
+		printtoken(stderr,tok,"parseEXPR");
+		if(tok.type==TT_ENDSTMT){
+			source=beforeop;
+			break;
+		}
+		if(tok.type!=TT_OP){
+			ast_free(tree);
+			return NULL;
+		}
+		int prec=precedence_len(tok.str,tok.len);
+		if(prec<minprec){
+			source=beforeop;
+			break;
+		}
+		Associativity assoc=associativity_len(tok.str,tok.len);
+		int q;
+		switch(assoc){
+			case AS_PREFIX: case AS_SUFFIX:
+				ast_free(tree);
+				return NULL;
+
+			case AS_LEFT: q=prec+1; break;
+			case AS_RIGHT: q=prec; break;
+			case AS_NONASSOC: q=prec+1; minprec=prec+1; break;
+
+			default: assert(false);
+		}
+		AST *right=parseexpr(source,&len,q);
+		if(!right){
+			ast_free(tree);
+			return NULL;
+		}
+		source+=len;
+		AST *opnode=malloc(sizeof(AST));
+		if(!opnode)outofmem();
+		opnode->type=AST_OP;
+		opnode->o.op=opconststring_len(tok.str,tok.len);
+		if(!opnode->o.op)outofmem();
+		opnode->o.left=tree;
+		opnode->o.right=right;
+		tree=opnode;
+	}
+	*reslen=source-origsource;
+	return tree;
 }
 
 static AST* parsestmt(const char *source,int *reslen){
 	return parseexpr(source,reslen,0);
 }
 
-ASTblock* parse(const char *source){
-	ASTblock *bl=malloc(sizeof(ASTblock));
+AST* parse(const char *source){
+	AST *bl=malloc(sizeof(AST));
+	if(!bl)outofmem();
+	bl->type=AST_BLOCK;
 	int sz=32;
-	bl->len=0;
-	bl->exprs=calloc(sz,sizeof(AST*));
-	if(!bl->exprs)outofmem();
+	bl->b.len=0;
+	bl->b.exprs=calloc(sz,sizeof(AST*));
+	if(!bl->b.exprs)outofmem();
 	int reslen;
 	int cursor=0;
 	while(true){
-		if(bl->len==sz){
+		if(bl->b.len==sz){
 			sz*=2;
-			bl->exprs=realloc(bl->exprs,sz*sizeof(AST*));
-			if(!bl->exprs)outofmem();
+			bl->b.exprs=realloc(bl->b.exprs,sz*sizeof(AST*));
+			if(!bl->b.exprs)outofmem();
 		}
-		parseintermediate(source+cursor,&reslen);
-		if(!source[cursor])break;
 		AST *node=parsestmt(source+cursor,&reslen);
 		if(!node){
-			ast_free((AST*)bl);
+			ast_free(bl);
 			return NULL;
 		}
-		bl->exprs[bl->len++]=node;
+		bl->b.exprs[bl->b.len++]=node;
 		cursor+=reslen;
+		const char *src=source+cursor;
+		Token tok=nexttoken(&src);
+		if(tok.type!=TT_ENDSTMT){
+			ast_free(bl);
+			return NULL;
+		}
+		cursor=src-source;
+		src=source+cursor;
+		tok=nexttoken(&src);
+		if(tok.type==TT_EOF)break;
 	}
 	return bl;
 }
 
-void ast_free(AST *ast_){
-	switch(ast_->type){
-		case AST_BLOCK:{ ASTblock *ast=(ASTblock*)ast_;
-			for(int i=0;i<ast->len;i++)if(ast->exprs[i])ast_free(ast->exprs[i]);
-			free(ast->exprs);
+static const char* charblock(char c,int n){
+	static char *buf=NULL;
+	if(!buf)buf=malloc(n+1);
+	else buf=realloc(buf,n+1);
+	if(!buf)outofmem();
+	memset(buf,c,n);
+	buf[n]='\0';
+	return buf;
+}
+
+#define TABW (4)
+#define INDENT fprintf(stream,"%s",charblock(' ',TABW*indent));
+static void ast_debug_(FILE *stream,const AST *ast,int indent){
+	switch(ast->type){
+		case AST_BLOCK:
+			if(ast->b.len==0){
+				fprintf(stream,"{}");
+				break;
+			}
+			fprintf(stream,"{\n");
+			indent++;
+			for(int i=0;i<ast->b.len;i++){
+				INDENT
+				ast_debug_(stream,ast->b.exprs[i],indent);
+				fputc('\n',stream);
+			}
+			indent--;
+			INDENT
+			fprintf(stream,"}");
+			break;
+
+		case AST_OP:{
+			bool leftp=ast->o.left&&ast->o.left->type==AST_OP&&precedence(ast->o.left->o.op)<=precedence(ast->o.op);
+			bool rightp=ast->o.right&&ast->o.right->type==AST_OP&&precedence(ast->o.right->o.op)<=precedence(ast->o.op);
+			//fprintf(stderr,"[[op='%s' p=%d lp=%d rp=%d]]",ast->o.op,precedence(ast->o.op),leftp,rightp);
+			if(leftp)fputc('(',stream);
+			if(ast->o.left)ast_debug_(stream,ast->o.left,indent);
+			fprintf(stream,"%s%s%s",leftp?")":"",ast->o.op,rightp?"(":"");
+			if(ast->o.right)ast_debug_(stream,ast->o.right,indent);
+			if(rightp)fputc(')',stream);
+			break;
+		}
+
+		case AST_NUM:
+			if(ast->n.isint)fprintf(stream,"%lld",ast->n.i);
+			else fprintf(stream,"%g",ast->n.d);
+			break;
+
+		case AST_STR:
+			fputc('"',stream);
+			for(int i=0;i<ast->s.len;i++){
+				if(ast->s.str[i]<32||ast->s.str[i]>126){
+					fprintf(stream,"\\x%c%c",hexencode(ast->s.str[i]/16),hexencode(ast->s.str[i]%16));
+				} else fputc(ast->s.str[i],stream);
+			}
+			fputc('"',stream);
+			break;
+
+		case AST_VAR:
+			fprintf(stream,"%s",ast->v.name);
+			break;
+
+		case AST_CALL:
+			fprintf(stream,"%s(",ast->c.func);
+			for(int i=0;i<ast->c.nargs;i++){
+				if(i!=0)fputc(',',stream);
+				ast_debug_(stream,ast->c.args[i],indent);
+			}
+			fputc(')',stream);
+			break;
+
+		case AST_IF:
+			assert(NOT_IMPLEMENTED);
+			break;
+
+		case AST_WHILE:
+			assert(NOT_IMPLEMENTED);
+			break;
+
+		default:
+			fprintf(stream,"AST_(??\?)");
+			break;
+	}
+}
+
+void ast_debug(FILE *stream,const AST *ast){
+	ast_debug_(stream,ast,0);
+	fputc('\n',stream);
+}
+
+void ast_free(AST *ast){
+	switch(ast->type){
+		case AST_BLOCK:{
+			for(int i=0;i<ast->b.len;i++)if(ast->b.exprs[i])ast_free(ast->b.exprs[i]);
+			free(ast->b.exprs);
 			break;
 		}
 
-		case AST_OP:{ ASTop *ast=(ASTop*)ast_;
-			if(ast->left)ast_free(ast->left);
-			if(ast->right)ast_free(ast->right);
+		case AST_OP:{
+			if(ast->o.left)ast_free(ast->o.left);
+			if(ast->o.right)ast_free(ast->o.right);
 			break;
 		}
 
 		case AST_NUM:
 			break;
 
-		case AST_STR:{ ASTstr *ast=(ASTstr*)ast_;
-			if(ast->str)free(ast->str);
+		case AST_STR:{
+			if(ast->s.str)free(ast->s.str);
 			break;
 		}
 
-		case AST_VAR:{ ASTvar *ast=(ASTvar*)ast_;
-			if(ast->name)free(ast->name);
+		case AST_VAR:{
+			if(ast->v.name)free(ast->v.name);
 			break;
 		}
 
-		case AST_CALL:{ ASTcall *ast=(ASTcall*)ast_;
-			if(ast->func)free(ast->func);
-			for(int i=0;i<ast->nargs;i++)if(ast->args[i])ast_free(ast->args[i]);
-			free(ast->args);
+		case AST_CALL:{
+			if(ast->c.func)free(ast->c.func);
+			for(int i=0;i<ast->c.nargs;i++)if(ast->c.args[i])ast_free(ast->c.args[i]);
+			free(ast->c.args);
 			break;
 		}
 
-		case AST_IF:{ ASTif *ast=(ASTif*)ast_;
-			if(ast->cond)free(ast->cond);
-			if(ast->thenb)free(ast->thenb);
-			if(ast->elseb)free(ast->elseb);
+		case AST_IF:{
+			if(ast->i.cond)free(ast->i.cond);
+			if(ast->i.thenb)free(ast->i.thenb);
+			if(ast->i.elseb)free(ast->i.elseb);
 			break;
 		}
 
-		case AST_WHILE:{ ASTwhile *ast=(ASTwhile*)ast_;
-			if(ast->cond)free(ast->cond);
-			if(ast->body)free(ast->body);
+		case AST_WHILE:{
+			if(ast->w.cond)free(ast->w.cond);
+			if(ast->w.body)free(ast->w.body);
 			break;
 		}
 	}
-	free(ast_);
+	free(ast);
 }
author	tomsmeding <tom.smeding@gmail.com>	2016-08-05 20:26:05 +0200
committer	tomsmeding <tom.smeding@gmail.com>	2016-08-06 10:10:32 +0200
commit	f67988fbfde6ad8a91466ef5d4227dcf9e5db6ce (patch)
tree	db85d3936f717331c3eeed4fae43e5ed43324be9 /parser.c
parent	e6bb770a52980ef3d85c2d4b93fb240c026ce7f7 (diff)