Working preliminary version of parser

author: tomsmeding <tom.smeding@gmail.com> 2016-08-05 20:26:05 +0200
committer: tomsmeding <tom.smeding@gmail.com> 2016-08-06 10:10:32 +0200
commit: f67988fbfde6ad8a91466ef5d4227dcf9e5db6ce (patch)
tree: db85d3936f717331c3eeed4fae43e5ed43324be9
parent: e6bb770a52980ef3d85c2d4b93fb240c026ce7f7 (diff)
9 files changed, 676 insertions, 138 deletions
diff --git a/LANGUAGE.txt b/LANGUAGE.txt
index 0e7066b..3259182 100644
--- a/LANGUAGE.txt
+++ b/LANGUAGE.txt
@@ -2,20 +2,23 @@ Statements are terminated by ';'.
 The usual infix expression rules apply, with the following precedence table:
 (higher precedence number means tighter binding)
 
- Operators   Precedence   Associativity
-    **           14           Right
-    - ! ~        12          Prefix (unary)
- * / // %        11           Left
-    + -          10           Left
-    &             9           Left
-    ^             8           Left
-    |             7           Left
- < > <= >=        6       Nonassociative
-   == !=          5       Nonassociative
-    &&            4           Left (short-circuiting)
-    ^^            3           Left
-    ||            2           Left (short-circuiting)
-    =             1           Right  (also += -= *= /= %= **= &= ^= |=)
+ Operators     Precedence   Associativity
+ = += -= *=         1           Right
+ /= //= %= **=      1           Right
+  &= ^= |=          1           Right
+    ||              2           Left (short-circuiting)
+    ^^              3           Left
+    &&              4           Left (short-circuiting)
+   == !=            5       Nonassociative
+ < > <= >=          6       Nonassociative
+    |               7           Left
+    ^               8           Left
+    &               9           Left
+    + -            10           Left
+ * / // %          11           Left
+  (-) ! ~          12          Prefix (unary)
+    (!)            13          Suffix (unary)
+    **             14           Right
 
 
 break and continue get parsed to calls to the __break() and __continue()
diff --git a/Makefile b/Makefile
index 55b2ef8..68912f8 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 CC = gcc
-CFLAGS = -Wall -Wextra -std=c11 -O2 -fwrapv
+CFLAGS = -Wall -Wextra -std=c11 -g -fwrapv
 BIN = main
 
 .PHONY: all clean remake
@@ -7,7 +7,8 @@ BIN = main
 all: $(BIN)
 
 clean:
-	rm -f $(BIN) *.o *.dSYM
+	rm -f $(BIN) *.o
+	rm -rf *.dSYM
 
 remake: clean all
 
diff --git a/code.txt b/code.txt
index 471fb1b..3b04c0f 100644
--- a/code.txt
+++ b/code.txt
@@ -1,2 +1,3 @@
 a = 1;
 b = 2;
+c = 1 + -x - 3 > -1;
diff --git a/genops.js b/genops.js
index 33b688d..9b88cbc 100755
--- a/genops.js
+++ b/genops.js
@@ -3,7 +3,6 @@ const fs=require("fs");
 
 
 function print(/*arguments*/){
-	//console.log.apply(console,arguments);
 	process.stdout.write.apply(process.stdout,arguments);
 }
 
@@ -45,8 +44,12 @@ function readopmap(fname){
 	return opmap;
 }
 
-function outputfunc(opmap,name,gen,padw){
-	print("int "+name+"(const char *op){\n");
+function outputfunc(_ /*opmap,name,gen,padw,dolen,rettype,defval,checkend*/){
+	let opmap=_.opmap, name=_.name, gen=_.gen,
+	    padw=_.padw, dolen=_.dolen, rettype=_.rettype,
+	    defval=_.defval, checkend=_.checkend;
+	print("\n"+rettype+" "+name+"(const char *op"+(dolen?",const int len":"")+"){\n");
+	if(dolen)print("\tif(len<=0)return "+defval+";\n");
 	print("\tswitch(op[0]){\n");
 
 	let firstchars={};
@@ -56,33 +59,98 @@ function outputfunc(opmap,name,gen,padw){
 	}
 	let arr=[];
 	for(k in firstchars){
-		arr.push([k,firstchars[k].sort()]);
+		arr.push([k,firstchars[k].sort(function(a,b){
+			return b.length-a.length;
+		})]);
 	}
 	arr=arr.sort();
 	for(let tup of arr){
 		let k=tup[0],ops=tup[1];
 		print("\t\tcase '"+k+"': return ");
+		let expr="";
 		for(let op of ops){
-			let cond="";
-			for(j=1;j<op.length;j++)cond+="op["+j+"]=='"+op[j]+"'&&";
-			cond+="op["+j+"]=='\\0'";
-			print(cond+" ? "+pad(gen(opmap[op]),padw)+" : ");
+			let cond;
+			if(dolen){
+				cond="len=="+op.length;
+				for(j=1;j<op.length;j++)cond+="&&op["+j+"]=='"+op[j]+"'";
+			} else {
+				cond="";
+				for(j=1;j<op.length;j++)cond+="op["+j+"]=='"+op[j]+"'&&";
+				if(checkend)cond+="!op["+j+"]";
+				else cond=cond.slice(0,-2);
+				if(cond=="")cond="true";
+			}
+			expr+=cond+"?"+pad(gen(op,opmap[op]),padw)+":";
 		}
-		print("-1;\n");
+		expr+=defval;
+		expr=expr.replace(/true\?([^:]*):[^)]*/,"$1");
+		print(expr+";\n");
 	}
 
-	print("\t\tdefault: return -1;\n");
+	print("\t\tdefault: return "+defval+";\n");
 	print("\t}\n");
 	print("}\n");
 }
 
 
 const opmap=readopmap("LANGUAGE.txt");
-outputfunc(opmap,"precedence",o=>o.prec,2);
-print("\n");
 const assocenum={
 	"Prefix": "AS_PREFIX", "Suffix": "AS_SUFFIX",
 	"Left": "AS_LEFT", "Right": "AS_RIGHT",
 	"Nonassociative": "AS_NONASSOC"
 };
-outputfunc(opmap,"associativity",o=>assocenum[o.assoc],-11);
+
+print("#include <stddef.h>\n\n");
+print("#include \"opfuncs.h\"\n");
+print("#include \"parser.h\"\n");
+
+function alsolen(_){
+	outputfunc(_);
+	_.name+="_len";
+	_.dolen=true;
+	outputfunc(_);
+}
+
+alsolen({
+	opmap:opmap,
+	name:"precedence",
+	gen:(op,o)=>o.prec,
+	padw:2,
+	dolen:false,
+	rettype:"int",
+	defval:"-1",
+	checkend:true
+});
+
+alsolen({
+	opmap:opmap,
+	name:"associativity",
+	gen:(op,o)=>assocenum[o.assoc],
+	padw:-11,
+	dolen:false,
+	rettype:"int",
+	defval:"-1",
+	checkend:true
+});
+
+outputfunc({
+	opmap:opmap,
+	name:"parseoplength",
+	gen:(op,o)=>op.length,
+	padw:1,
+	dolen:false,
+	rettype:"int",
+	defval:"-1",
+	checkend:false
+});
+
+outputfunc({
+	opmap:opmap,
+	name:"opconststring_len",
+	gen:(op,o)=>'"'+op+'"',
+	padw:3,
+	dolen:true,
+	rettype:"const char*",
+	defval:"NULL",
+	checkend:true
+});
diff --git a/main.c b/main.c
index 6945950..b2f31dd 100644
--- a/main.c
+++ b/main.c
@@ -77,5 +77,11 @@ int main(int argc,char **argv){
 		return 1;
 	}
 
-	;
+	AST *ast=parse(source);
+	if(ast==NULL){
+		fprintf(stderr,"Parsing error!\n");
+		return 1;
+	}
+	ast_debug(stderr,ast);
+	ast_free(ast);
 }
diff --git a/opfuncs.c b/opfuncs.c
new file mode 100644
index 0000000..a5607ae
--- /dev/null
+++ b/opfuncs.c
@@ -0,0 +1,127 @@
+#include <stddef.h>
+
+#include "opfuncs.h"
+#include "parser.h"
+
+int precedence(const char *op){
+	switch(op[0]){
+		case '!': return op[1]=='='&&!op[2]? 5:!op[1]?12:-1;
+		case '%': return op[1]=='='&&!op[2]? 1:!op[1]?11:-1;
+		case '&': return op[1]=='='&&!op[2]? 1:op[1]=='&'&&!op[2]? 4:!op[1]? 9:-1;
+		case '(': return op[1]=='-'&&op[2]==')'&&!op[3]?12:op[1]=='!'&&op[2]==')'&&!op[3]?13:-1;
+		case '*': return op[1]=='*'&&op[2]=='='&&!op[3]? 1:op[1]=='='&&!op[2]? 1:op[1]=='*'&&!op[2]?14:!op[1]?11:-1;
+		case '+': return op[1]=='='&&!op[2]? 1:!op[1]?10:-1;
+		case '-': return op[1]=='='&&!op[2]? 1:!op[1]?10:-1;
+		case '/': return op[1]=='/'&&op[2]=='='&&!op[3]? 1:op[1]=='='&&!op[2]? 1:op[1]=='/'&&!op[2]?11:!op[1]?11:-1;
+		case '<': return op[1]=='='&&!op[2]? 6:!op[1]? 6:-1;
+		case '=': return op[1]=='='&&!op[2]? 5:!op[1]? 1:-1;
+		case '>': return op[1]=='='&&!op[2]? 6:!op[1]? 6:-1;
+		case '^': return op[1]=='='&&!op[2]? 1:op[1]=='^'&&!op[2]? 3:!op[1]? 8:-1;
+		case '|': return op[1]=='='&&!op[2]? 1:op[1]=='|'&&!op[2]? 2:!op[1]? 7:-1;
+		case '~': return !op[1]?12:-1;
+		default: return -1;
+	}
+}
+
+int precedence_len(const char *op,const int len){
+	if(len<=0)return -1;
+	switch(op[0]){
+		case '!': return len==2&&op[1]=='='? 5:len==1?12:-1;
+		case '%': return len==2&&op[1]=='='? 1:len==1?11:-1;
+		case '&': return len==2&&op[1]=='='? 1:len==2&&op[1]=='&'? 4:len==1? 9:-1;
+		case '(': return len==3&&op[1]=='-'&&op[2]==')'?12:len==3&&op[1]=='!'&&op[2]==')'?13:-1;
+		case '*': return len==3&&op[1]=='*'&&op[2]=='='? 1:len==2&&op[1]=='='? 1:len==2&&op[1]=='*'?14:len==1?11:-1;
+		case '+': return len==2&&op[1]=='='? 1:len==1?10:-1;
+		case '-': return len==2&&op[1]=='='? 1:len==1?10:-1;
+		case '/': return len==3&&op[1]=='/'&&op[2]=='='? 1:len==2&&op[1]=='='? 1:len==2&&op[1]=='/'?11:len==1?11:-1;
+		case '<': return len==2&&op[1]=='='? 6:len==1? 6:-1;
+		case '=': return len==2&&op[1]=='='? 5:len==1? 1:-1;
+		case '>': return len==2&&op[1]=='='? 6:len==1? 6:-1;
+		case '^': return len==2&&op[1]=='='? 1:len==2&&op[1]=='^'? 3:len==1? 8:-1;
+		case '|': return len==2&&op[1]=='='? 1:len==2&&op[1]=='|'? 2:len==1? 7:-1;
+		case '~': return len==1?12:-1;
+		default: return -1;
+	}
+}
+
+int associativity(const char *op){
+	switch(op[0]){
+		case '!': return op[1]=='='&&!op[2]?AS_NONASSOC:!op[1]?AS_PREFIX  :-1;
+		case '%': return op[1]=='='&&!op[2]?AS_RIGHT   :!op[1]?AS_LEFT    :-1;
+		case '&': return op[1]=='='&&!op[2]?AS_RIGHT   :op[1]=='&'&&!op[2]?AS_LEFT    :!op[1]?AS_LEFT    :-1;
+		case '(': return op[1]=='-'&&op[2]==')'&&!op[3]?AS_PREFIX  :op[1]=='!'&&op[2]==')'&&!op[3]?AS_SUFFIX  :-1;
+		case '*': return op[1]=='*'&&op[2]=='='&&!op[3]?AS_RIGHT   :op[1]=='='&&!op[2]?AS_RIGHT   :op[1]=='*'&&!op[2]?AS_RIGHT   :!op[1]?AS_LEFT    :-1;
+		case '+': return op[1]=='='&&!op[2]?AS_RIGHT   :!op[1]?AS_LEFT    :-1;
+		case '-': return op[1]=='='&&!op[2]?AS_RIGHT   :!op[1]?AS_LEFT    :-1;
+		case '/': return op[1]=='/'&&op[2]=='='&&!op[3]?AS_RIGHT   :op[1]=='='&&!op[2]?AS_RIGHT   :op[1]=='/'&&!op[2]?AS_LEFT    :!op[1]?AS_LEFT    :-1;
+		case '<': return op[1]=='='&&!op[2]?AS_NONASSOC:!op[1]?AS_NONASSOC:-1;
+		case '=': return op[1]=='='&&!op[2]?AS_NONASSOC:!op[1]?AS_RIGHT   :-1;
+		case '>': return op[1]=='='&&!op[2]?AS_NONASSOC:!op[1]?AS_NONASSOC:-1;
+		case '^': return op[1]=='='&&!op[2]?AS_RIGHT   :op[1]=='^'&&!op[2]?AS_LEFT    :!op[1]?AS_LEFT    :-1;
+		case '|': return op[1]=='='&&!op[2]?AS_RIGHT   :op[1]=='|'&&!op[2]?AS_LEFT    :!op[1]?AS_LEFT    :-1;
+		case '~': return !op[1]?AS_PREFIX  :-1;
+		default: return -1;
+	}
+}
+
+int associativity_len(const char *op,const int len){
+	if(len<=0)return -1;
+	switch(op[0]){
+		case '!': return len==2&&op[1]=='='?AS_NONASSOC:len==1?AS_PREFIX  :-1;
+		case '%': return len==2&&op[1]=='='?AS_RIGHT   :len==1?AS_LEFT    :-1;
+		case '&': return len==2&&op[1]=='='?AS_RIGHT   :len==2&&op[1]=='&'?AS_LEFT    :len==1?AS_LEFT    :-1;
+		case '(': return len==3&&op[1]=='-'&&op[2]==')'?AS_PREFIX  :len==3&&op[1]=='!'&&op[2]==')'?AS_SUFFIX  :-1;
+		case '*': return len==3&&op[1]=='*'&&op[2]=='='?AS_RIGHT   :len==2&&op[1]=='='?AS_RIGHT   :len==2&&op[1]=='*'?AS_RIGHT   :len==1?AS_LEFT    :-1;
+		case '+': return len==2&&op[1]=='='?AS_RIGHT   :len==1?AS_LEFT    :-1;
+		case '-': return len==2&&op[1]=='='?AS_RIGHT   :len==1?AS_LEFT    :-1;
+		case '/': return len==3&&op[1]=='/'&&op[2]=='='?AS_RIGHT   :len==2&&op[1]=='='?AS_RIGHT   :len==2&&op[1]=='/'?AS_LEFT    :len==1?AS_LEFT    :-1;
+		case '<': return len==2&&op[1]=='='?AS_NONASSOC:len==1?AS_NONASSOC:-1;
+		case '=': return len==2&&op[1]=='='?AS_NONASSOC:len==1?AS_RIGHT   :-1;
+		case '>': return len==2&&op[1]=='='?AS_NONASSOC:len==1?AS_NONASSOC:-1;
+		case '^': return len==2&&op[1]=='='?AS_RIGHT   :len==2&&op[1]=='^'?AS_LEFT    :len==1?AS_LEFT    :-1;
+		case '|': return len==2&&op[1]=='='?AS_RIGHT   :len==2&&op[1]=='|'?AS_LEFT    :len==1?AS_LEFT    :-1;
+		case '~': return len==1?AS_PREFIX  :-1;
+		default: return -1;
+	}
+}
+
+int parseoplength(const char *op){
+	switch(op[0]){
+		case '!': return op[1]=='='?2:1;
+		case '%': return op[1]=='='?2:1;
+		case '&': return op[1]=='='?2:op[1]=='&'?2:1;
+		case '(': return op[1]=='-'&&op[2]==')'?3:op[1]=='!'&&op[2]==')'?3:-1;
+		case '*': return op[1]=='*'&&op[2]=='='?3:op[1]=='='?2:op[1]=='*'?2:1;
+		case '+': return op[1]=='='?2:1;
+		case '-': return op[1]=='='?2:1;
+		case '/': return op[1]=='/'&&op[2]=='='?3:op[1]=='='?2:op[1]=='/'?2:1;
+		case '<': return op[1]=='='?2:1;
+		case '=': return op[1]=='='?2:1;
+		case '>': return op[1]=='='?2:1;
+		case '^': return op[1]=='='?2:op[1]=='^'?2:1;
+		case '|': return op[1]=='='?2:op[1]=='|'?2:1;
+		case '~': return 1;
+		default: return -1;
+	}
+}
+
+const char* opconststring_len(const char *op,const int len){
+	if(len<=0)return NULL;
+	switch(op[0]){
+		case '!': return len==2&&op[1]=='='?"!=":len==1?"!":NULL;
+		case '%': return len==2&&op[1]=='='?"%=":len==1?"%":NULL;
+		case '&': return len==2&&op[1]=='='?"&=":len==2&&op[1]=='&'?"&&":len==1?"&":NULL;
+		case '(': return len==3&&op[1]=='-'&&op[2]==')'?"(-)":len==3&&op[1]=='!'&&op[2]==')'?"(!)":NULL;
+		case '*': return len==3&&op[1]=='*'&&op[2]=='='?"**=":len==2&&op[1]=='='?"*=":len==2&&op[1]=='*'?"**":len==1?"*":NULL;
+		case '+': return len==2&&op[1]=='='?"+=":len==1?"+":NULL;
+		case '-': return len==2&&op[1]=='='?"-=":len==1?"-":NULL;
+		case '/': return len==3&&op[1]=='/'&&op[2]=='='?"//=":len==2&&op[1]=='='?"/=":len==2&&op[1]=='/'?"//":len==1?"/":NULL;
+		case '<': return len==2&&op[1]=='='?"<=":len==1?"<":NULL;
+		case '=': return len==2&&op[1]=='='?"==":len==1?"=":NULL;
+		case '>': return len==2&&op[1]=='='?">=":len==1?">":NULL;
+		case '^': return len==2&&op[1]=='='?"^=":len==2&&op[1]=='^'?"^^":len==1?"^":NULL;
+		case '|': return len==2&&op[1]=='='?"|=":len==2&&op[1]=='|'?"||":len==1?"|":NULL;
+		case '~': return len==1?"~":NULL;
+		default: return NULL;
+	}
+}
diff --git a/opfuncs.h b/opfuncs.h
new file mode 100644
index 0000000..a5e212c
--- /dev/null
+++ b/opfuncs.h
@@ -0,0 +1,8 @@
+#pragma once
+
+int precedence(const char *op);
+int precedence_len(const char *op,const int len);
+int associativity(const char *op);
+int associativity_len(const char *op,const int len);
+int parseoplength(const char *op);
+const char* opconststring_len(const char *op,const int len);
diff --git a/parser.c b/parser.c
index 96b9bf7..14fd47d 100644
--- a/parser.c
+++ b/parser.c
@@ -2,193 +2,515 @@
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
+#include <assert.h>
 
 #include "memory.h"
+#include "opfuncs.h"
 #include "parser.h"
 
 
+#define NOT_IMPLEMENTED false
+
+
+static bool ishexdigit(char c){
+	return (c>='0'&&c<='9')||(c>='a'&&c<='f')||(c>='A'&&c<='F');
+}
+
+static int hexnumber(char c){
+	return c<='9'?c-'0':(c&~('a'-'A'))-'A'+10;
+}
+
+static char hexencode(int n){ //Maps a nibble value 0..15 to its lowercase hexadecimal digit character
+	return n<10?n+'0':n-10+'a'; //values 10..15 start at 'a', hence the -10 offset
+}
+
+
 typedef enum Tokentype{
 	TT_NUM,
 	TT_STR,
 	TT_WORD,
-	TT_SYM
+	TT_OP,
+	TT_SYM, //all symbols that are not operators
+	TT_ENDSTMT,
+	TT_EOF,
+
+	TT_ERR=-1
 } Tokentype;
 
 typedef struct Token{
-	const char *str;
+	Tokentype type;
+	const char *str; //Part of another string; not null-terminated, and do not free
 	int len;
 } Token;
 
-Token nexttoken(const char **sourcep){
+
+static bool parsecomment(const char **sourcep){ //Skips one comment at *sourcep; returns whether one was consumed
+	const char *source=*sourcep;
+	if(*source!='#')return false; //not a comment at all; *sourcep is left untouched
+	if(source[1]=='#'&&source[2]=='#'){ //block comment: ### ... ###
+		source+=3;
+		while(*source&&
+			  (*source!='#'||source[1]!='#'||source[2]!='#')){
+			source++;
+		}
+		if(!*source)return false; //unclosed block comment
+		source+=3; //skip the whole closing ###; skipping only 2 left a '#' that was re-read as a line comment
+	} else { //line comment: runs up to and including the newline
+		while(*source&&*source!='\n')source++;
+		if(*source)source++;
+	}
+	*sourcep=source;
+	return true;
+}
+
+static void skipintermediate(const char **sourcep){ //Advances *sourcep past any run of whitespace and comments
+	const char *source=*sourcep;
+	bool acted;
+	do {
+		acted=false;
+		while(isspace((unsigned char)*source)){ //cast: passing a negative char to isspace is undefined behaviour
+			source++;
+			acted=true;
+		}
+		if(parsecomment(&source)){
+			acted=true;
+		}
+	} while(acted); //repeat until a pass consumes neither whitespace nor a comment
+	*sourcep=source;
+}
+
+static Token nexttoken(const char **sourcep){
+	skipintermediate(sourcep);
 	const char *source=*sourcep;
-	while(isspace(*source))source++;
+	if(*source=='\0'){
+		Token tok={TT_EOF,NULL,-1};
+		return tok;
+	}
+	if(*source==';'){
+		Token tok={TT_ENDSTMT,source,1};
+		(*sourcep)++;
+		return tok;
+	}
 	if(isdigit(*source)||(*source=='-'&&isdigit(source[1]))){
 		char *endp;
 		strtod(source,&endp);
 		assert(endp!=source);
-		Token tok={source,endp-source};
+		Token tok={TT_NUM,source,endp-source};
+		*sourcep=endp;
+		return tok;
+	}
+	if(*source=='"'){
+		int i;
+		for(i=1;source[i]&&source[i]!='"';i++){
+			if(source[i]=='\\')i++;
+		}
+		if(!source[i]){
+			Token tok={TT_ERR,"Non-terminated string",21};
+			return tok;
+		}
+		*sourcep+=i+1;
+		Token tok={TT_STR,source,i+1};
+		return tok;
+	}
+	int oplen=parseoplength(source);
+	if(oplen!=-1){
+		Token tok={TT_OP,source,oplen};
+		*sourcep+=oplen;
+		return tok;
+	}
+	if(strchr("(){}",*source)!=NULL){
+		Token tok={TT_SYM,source,1};
+		(*sourcep)++;
+		return tok;
+	}
+	if(isalpha(*source)||*source=='_'){
+		int i;
+		for(i=1;source[i];i++){
+			if(!isalpha(source[i])&&!isdigit(source[i])&&source[i]!='_')break;
+		}
+		Token tok={TT_WORD,source,i};
+		*sourcep+=i;
 		return tok;
 	}
+	Token tok={TT_ERR,"Unrecognised token",18};
+	return tok;
 }
 
 
-int precedence(const char *op){
-	switch(op[0]){
-		case '!': return op[1]=='\0' ? 12 : op[1]=='='&&op[2]=='\0' ?  5 : -1;
-		case '%': return op[1]=='\0' ? 11 : -1;
-		case '&': return op[1]=='\0' ?  9 : op[1]=='&'&&op[2]=='\0' ?  4 : -1;
-		case '*': return op[1]=='\0' ? 11 : op[1]=='*'&&op[2]=='\0' ? 14 : -1;
-		case '+': return op[1]=='\0' ? 10 : -1;
-		case '-': return op[1]=='\0' ? 10 : -1;
-		case '/': return op[1]=='\0' ? 11 : op[1]=='/'&&op[2]=='\0' ? 11 : -1;
-		case '<': return op[1]=='\0' ?  6 : op[1]=='='&&op[2]=='\0' ?  6 : -1;
-		case '=': return op[1]=='\0' ?  1 : op[1]=='='&&op[2]=='\0' ?  5 : -1;
-		case '>': return op[1]=='\0' ?  6 : op[1]=='='&&op[2]=='\0' ?  6 : -1;
-		case '^': return op[1]=='\0' ?  8 : op[1]=='^'&&op[2]=='\0' ?  3 : -1;
-		case '|': return op[1]=='\0' ?  7 : op[1]=='|'&&op[2]=='\0' ?  2 : -1;
-		case '~': return op[1]=='\0' ? 12 : -1;
-		default: return -1;
+static void printtoken(FILE *stream,Token tok,const char *msg){
+	const char *type;
+	switch(tok.type){
+		case TT_NUM: type="TT_NUM"; break;
+		case TT_STR: type="TT_STR"; break;
+		case TT_WORD: type="TT_WORD"; break;
+		case TT_OP: type="TT_OP"; break;
+		case TT_SYM: type="TT_SYM"; break;
+		case TT_ENDSTMT: type="TT_ENDSTMT"; break;
+		case TT_EOF: type="TT_EOF"; break;
+		case TT_ERR: type="TT_ERR"; break;
+		default: type="TT_(??\?)"; break; //TRIGRAPHS ._.
 	}
-}
-
-int associativity(const char *op){
-	switch(op[0]){
-		case '!': return op[1]=='\0' ? AS_PREFIX   : op[1]=='='&&op[2]=='\0' ? AS_NONASSOC : -1;
-		case '%': return op[1]=='\0' ? AS_LEFT     : -1;
-		case '&': return op[1]=='\0' ? AS_LEFT     : op[1]=='&'&&op[2]=='\0' ? AS_LEFT     : -1;
-		case '*': return op[1]=='\0' ? AS_LEFT     : op[1]=='*'&&op[2]=='\0' ? AS_RIGHT    : -1;
-		case '+': return op[1]=='\0' ? AS_LEFT     : -1;
-		case '-': return op[1]=='\0' ? AS_LEFT     : -1;
-		case '/': return op[1]=='\0' ? AS_LEFT     : op[1]=='/'&&op[2]=='\0' ? AS_LEFT     : -1;
-		case '<': return op[1]=='\0' ? AS_NONASSOC : op[1]=='='&&op[2]=='\0' ? AS_NONASSOC : -1;
-		case '=': return op[1]=='\0' ? AS_RIGHT    : op[1]=='='&&op[2]=='\0' ? AS_NONASSOC : -1;
-		case '>': return op[1]=='\0' ? AS_NONASSOC : op[1]=='='&&op[2]=='\0' ? AS_NONASSOC : -1;
-		case '^': return op[1]=='\0' ? AS_LEFT     : op[1]=='^'&&op[2]=='\0' ? AS_LEFT     : -1;
-		case '|': return op[1]=='\0' ? AS_LEFT     : op[1]=='|'&&op[2]=='\0' ? AS_LEFT     : -1;
-		case '~': return op[1]=='\0' ? AS_PREFIX   : -1;
-		default: return -1;
+	if(tok.len!=-1){
+		char buf[tok.len+1];
+		memcpy(buf,tok.str,tok.len);
+		buf[tok.len]='\0';
+		fprintf(stream,"(%s) Token: %s '%s'\n",msg,type,buf);
+	} else {
+		fprintf(stream,"(%s) Token: %s (null)\n",msg,type);
 	}
 }
 
 
-static bool parsecomment(const char *source,int *reslen){
-	int cursor=0;
-	if(source[cursor]!='#')return false;
-	if(source[cursor+1]=='#'&&source[cursor+2]=='#'){
-		cursor+=3;
-		while(source[cursor]&&
-			  (source[cursor]!='#'||source[cursor+1]!='#'||source[cursor+2]!='#')){
-			cursor++;
+static AST* parseterm(const char *source,int *reslen){ //Parses one term at source; returns its node (NULL on error) and stores the consumed length in *reslen
+	const char *origsource=source;
+	const Token tok=nexttoken(&source);
+	printtoken(stderr,tok,"parseterm");
+	AST *node;
+	switch(tok.type){
+		case TT_NUM:{
+			node=malloc(sizeof(AST));
+			if(!node)outofmem();
+			node->type=AST_NUM;
+			char *endp;
+			long long intv=strtoll(tok.str,&endp,0); //wide parse: an int would truncate large literals (n.i is printed with %lld)
+			node->n.isint=endp-tok.str==tok.len; //integer only if the integer parse consumed the entire token
+			if(node->n.isint)node->n.i=intv;
+			else node->n.d=strtod(tok.str,NULL);
+			break;
 		}
-		if(!source[cursor])return false; //unclosed block comment
-		cursor+=2;
-	} else {
-		while(source[cursor]&&source[cursor]!='\n')cursor++;
-		if(source[cursor])cursor++;
-	}
-	*reslen=cursor;
-	return true;
-}
 
-static void parseintermediate(const char *source,int *reslen){
-	int cursor=0;
-	bool acted;
-	do {
-		acted=false;
-		while(source[cursor]&&isspace(source[cursor])){
-			cursor++;
-			acted=true;
+		case TT_STR:{
+			int slen=0; //first pass: count decoded characters and validate \x escapes
+			for(int i=1;i<tok.len-1;i++){
+				slen++;
+				if(tok.str[i]!='\\')continue;
+				i++; //now on the character that follows the backslash
+				if(tok.str[i]=='x'){
+					if(i+2>=tok.len-1||!ishexdigit(tok.str[i+1])||!ishexdigit(tok.str[i+2])){
+						return NULL;
+					}
+					i+=2;
+				} else {
+					//single-character escape: already consumed above; advancing again would skip a character and under-count slen
+				}
+			}
+			node=malloc(sizeof(AST));
+			if(!node)outofmem();
+			node->type=AST_STR;
+			node->s.str=malloc(slen+1);
+			if(!node->s.str)outofmem();
+			int j=0; //second pass: decode into the buffer sized by the first pass
+			for(int i=1;i<tok.len-1;i++){
+				if(tok.str[i]!='\\'){
+					node->s.str[j++]=tok.str[i];
+					continue;
+				}
+				i++;
+				switch(tok.str[i]){
+					case 'n': node->s.str[j++]='\n'; break;
+					case 'r': node->s.str[j++]='\r'; break;
+					case 't': node->s.str[j++]='\t'; break;
+					case 'b': node->s.str[j++]='\b'; break;
+					case 'a': node->s.str[j++]='\a'; break;
+					case 'x':
+						node->s.str[j++]=16*hexnumber(tok.str[i+1])+hexnumber(tok.str[i+2]);
+						i+=2;
+						break;
+					default:
+						node->s.str[j++]=tok.str[i];
+						break;
+				}
+			}
+			node->s.str[j]='\0'; node->s.len=j; //record the decoded length; ast_debug_ iterates up to s.len
+			break;
 		}
-		int partlen;
-		if(parsecomment(source+cursor,&partlen)){
-			cursor+=partlen;
-			acted=true;
+
+		case TT_WORD:{
+			if(tok.len==2&&memcmp(tok.str,"if",2)==0)assert(NOT_IMPLEMENTED);
+			if(tok.len==5&&memcmp(tok.str,"while",5)==0)assert(NOT_IMPLEMENTED); //compare all 5 bytes, not just "wh"
+			const char *tempsource=source;
+			Token next=nexttoken(&source); //peek one token ahead to detect a call
+			if(next.len==1&&next.str[0]=='(')assert(NOT_IMPLEMENTED);
+			source=tempsource; //undo the lookahead
+			node=malloc(sizeof(AST));
+			if(!node)outofmem();
+			node->type=AST_VAR;
+			node->v.name=malloc(tok.len+1);
+			if(!node->v.name)outofmem();
+			memcpy(node->v.name,tok.str,tok.len);
+			node->v.name[tok.len]='\0';
+			break;
 		}
-	} while(acted);
-	*reslen=cursor;
+
+		case TT_SYM:
+			assert(NOT_IMPLEMENTED);
+			break;
+
+		case TT_OP:{
+			char buf[tok.len+3]; //the operator wrapped in parentheses, e.g. "-" becomes "(-)"
+			buf[0]='(';
+			memcpy(buf+1,tok.str,tok.len);
+			buf[tok.len+1]=')';
+			buf[tok.len+2]='\0';
+			if(associativity(buf)==AS_PREFIX){ //NOTE(review): only "(-)" is AS_PREFIX in the table; "!" and "~" become "(!)"/"(~)" and are rejected here - confirm intended
+				node=malloc(sizeof(AST));
+				if(!node)outofmem();
+				node->type=AST_OP;
+				node->o.op=opconststring_len(buf,tok.len+2);
+				node->o.left=NULL; //prefix operator: operand goes on the right only
+				int len;
+				node->o.right=parseterm(source,&len);
+				if(!node->o.right){
+					free(node);
+					return NULL;
+				}
+				source+=len;
+			} else return NULL;
+			break;
+		}
+
+		case TT_ENDSTMT:
+		case TT_EOF:
+		case TT_ERR:
+			return NULL;
+	}
+	*reslen=source-origsource;
+	return node;
 }
 
+//Uses precedence climbing
 static AST* parseexpr(const char *source,int *reslen,int minprec){
-	;
+	const char *origsource=source;
+	int len;
+	AST *tree=parseterm(source,&len);
+	if(!tree)return NULL;
+	source+=len;
+	while(true){
+		const char *beforeop=source;
+		Token tok=nexttoken(&source);
+		printtoken(stderr,tok,"parseEXPR");
+		if(tok.type==TT_ENDSTMT){
+			source=beforeop;
+			break;
+		}
+		if(tok.type!=TT_OP){
+			ast_free(tree);
+			return NULL;
+		}
+		int prec=precedence_len(tok.str,tok.len);
+		if(prec<minprec){
+			source=beforeop;
+			break;
+		}
+		Associativity assoc=associativity_len(tok.str,tok.len);
+		int q;
+		switch(assoc){
+			case AS_PREFIX: case AS_SUFFIX:
+				ast_free(tree);
+				return NULL;
+
+			case AS_LEFT: q=prec+1; break;
+			case AS_RIGHT: q=prec; break;
+			case AS_NONASSOC: q=prec+1; minprec=prec+1; break;
+
+			default: assert(false);
+		}
+		AST *right=parseexpr(source,&len,q);
+		if(!right){
+			ast_free(tree);
+			return NULL;
+		}
+		source+=len;
+		AST *opnode=malloc(sizeof(AST));
+		if(!opnode)outofmem();
+		opnode->type=AST_OP;
+		opnode->o.op=opconststring_len(tok.str,tok.len);
+		if(!opnode->o.op)outofmem();
+		opnode->o.left=tree;
+		opnode->o.right=right;
+		tree=opnode;
+	}
+	*reslen=source-origsource;
+	return tree;
 }
 
 static AST* parsestmt(const char *source,int *reslen){
 	return parseexpr(source,reslen,0);
 }
 
-ASTblock* parse(const char *source){
-	ASTblock *bl=malloc(sizeof(ASTblock));
+AST* parse(const char *source){
+	AST *bl=malloc(sizeof(AST));
+	if(!bl)outofmem();
+	bl->type=AST_BLOCK;
 	int sz=32;
-	bl->len=0;
-	bl->exprs=calloc(sz,sizeof(AST*));
-	if(!bl->exprs)outofmem();
+	bl->b.len=0;
+	bl->b.exprs=calloc(sz,sizeof(AST*));
+	if(!bl->b.exprs)outofmem();
 	int reslen;
 	int cursor=0;
 	while(true){
-		if(bl->len==sz){
+		if(bl->b.len==sz){
 			sz*=2;
-			bl->exprs=realloc(bl->exprs,sz*sizeof(AST*));
-			if(!bl->exprs)outofmem();
+			bl->b.exprs=realloc(bl->b.exprs,sz*sizeof(AST*));
+			if(!bl->b.exprs)outofmem();
 		}
-		parseintermediate(source+cursor,&reslen);
-		if(!source[cursor])break;
 		AST *node=parsestmt(source+cursor,&reslen);
 		if(!node){
-			ast_free((AST*)bl);
+			ast_free(bl);
 			return NULL;
 		}
-		bl->exprs[bl->len++]=node;
+		bl->b.exprs[bl->b.len++]=node;
 		cursor+=reslen;
+		const char *src=source+cursor;
+		Token tok=nexttoken(&src);
+		if(tok.type!=TT_ENDSTMT){
+			ast_free(bl);
+			return NULL;
+		}
+		cursor=src-source;
+		src=source+cursor;
+		tok=nexttoken(&src);
+		if(tok.type==TT_EOF)break;
 	}
 	return bl;
 }
 
-void ast_free(AST *ast_){
-	switch(ast_->type){
-		case AST_BLOCK:{ ASTblock *ast=(ASTblock*)ast_;
-			for(int i=0;i<ast->len;i++)if(ast->exprs[i])ast_free(ast->exprs[i]);
-			free(ast->exprs);
+static const char* charblock(char c,int n){
+	static char *buf=NULL;
+	if(!buf)buf=malloc(n+1);
+	else buf=realloc(buf,n+1);
+	if(!buf)outofmem();
+	memset(buf,c,n);
+	buf[n]='\0';
+	return buf;
+}
+
+#define TABW (4)
+#define INDENT fprintf(stream,"%s",charblock(' ',TABW*indent));
+static void ast_debug_(FILE *stream,const AST *ast,int indent){
+	switch(ast->type){
+		case AST_BLOCK:
+			if(ast->b.len==0){
+				fprintf(stream,"{}");
+				break;
+			}
+			fprintf(stream,"{\n");
+			indent++;
+			for(int i=0;i<ast->b.len;i++){
+				INDENT
+				ast_debug_(stream,ast->b.exprs[i],indent);
+				fputc('\n',stream);
+			}
+			indent--;
+			INDENT
+			fprintf(stream,"}");
+			break;
+
+		case AST_OP:{
+			bool leftp=ast->o.left&&ast->o.left->type==AST_OP&&precedence(ast->o.left->o.op)<=precedence(ast->o.op);
+			bool rightp=ast->o.right&&ast->o.right->type==AST_OP&&precedence(ast->o.right->o.op)<=precedence(ast->o.op);
+			//fprintf(stderr,"[[op='%s' p=%d lp=%d rp=%d]]",ast->o.op,precedence(ast->o.op),leftp,rightp);
+			if(leftp)fputc('(',stream);
+			if(ast->o.left)ast_debug_(stream,ast->o.left,indent);
+			fprintf(stream,"%s%s%s",leftp?")":"",ast->o.op,rightp?"(":"");
+			if(ast->o.right)ast_debug_(stream,ast->o.right,indent);
+			if(rightp)fputc(')',stream);
+			break;
+		}
+
+		case AST_NUM:
+			if(ast->n.isint)fprintf(stream,"%lld",ast->n.i);
+			else fprintf(stream,"%g",ast->n.d);
+			break;
+
+		case AST_STR:
+			fputc('"',stream);
+			for(int i=0;i<ast->s.len;i++){
+				if(ast->s.str[i]<32||ast->s.str[i]>126){
+					fprintf(stream,"\\x%c%c",hexencode(ast->s.str[i]/16),hexencode(ast->s.str[i]%16));
+				} else fputc(ast->s.str[i],stream);
+			}
+			fputc('"',stream);
+			break;
+
+		case AST_VAR:
+			fprintf(stream,"%s",ast->v.name);
+			break;
+
+		case AST_CALL:
+			fprintf(stream,"%s(",ast->c.func);
+			for(int i=0;i<ast->c.nargs;i++){
+				if(i!=0)fputc(',',stream);
+				ast_debug_(stream,ast->c.args[i],indent);
+			}
+			fputc(')',stream);
+			break;
+
+		case AST_IF:
+			assert(NOT_IMPLEMENTED);
+			break;
+
+		case AST_WHILE:
+			assert(NOT_IMPLEMENTED);
+			break;
+
+		default:
+			fprintf(stream,"AST_(??\?)");
+			break;
+	}
+}
+
+void ast_debug(FILE *stream,const AST *ast){
+	ast_debug_(stream,ast,0);
+	fputc('\n',stream);
+}
+
+void ast_free(AST *ast){ //Recursively releases ast and everything it owns; ast itself must be non-NULL
+	switch(ast->type){
+		case AST_BLOCK:{
+			for(int i=0;i<ast->b.len;i++)if(ast->b.exprs[i])ast_free(ast->b.exprs[i]);
+			free(ast->b.exprs);
 			break;
 		}
 
-		case AST_OP:{ ASTop *ast=(ASTop*)ast_;
-			if(ast->left)ast_free(ast->left);
-			if(ast->right)ast_free(ast->right);
+		case AST_OP:{ //o.op is not freed here; the parser stores a string literal from opconststring_len
+			if(ast->o.left)ast_free(ast->o.left);
+			if(ast->o.right)ast_free(ast->o.right);
 			break;
 		}
 
 		case AST_NUM:
 			break;
 
-		case AST_STR:{ ASTstr *ast=(ASTstr*)ast_;
-			if(ast->str)free(ast->str);
+		case AST_STR:{
+			if(ast->s.str)free(ast->s.str);
 			break;
 		}
 
-		case AST_VAR:{ ASTvar *ast=(ASTvar*)ast_;
-			if(ast->name)free(ast->name);
+		case AST_VAR:{
+			if(ast->v.name)free(ast->v.name);
 			break;
 		}
 
-		case AST_CALL:{ ASTcall *ast=(ASTcall*)ast_;
-			if(ast->func)free(ast->func);
-			for(int i=0;i<ast->nargs;i++)if(ast->args[i])ast_free(ast->args[i]);
-			free(ast->args);
+		case AST_CALL:{
+			if(ast->c.func)free(ast->c.func);
+			for(int i=0;i<ast->c.nargs;i++)if(ast->c.args[i])ast_free(ast->c.args[i]);
+			free(ast->c.args);
 			break;
 		}
 
-		case AST_IF:{ ASTif *ast=(ASTif*)ast_;
-			if(ast->cond)free(ast->cond);
-			if(ast->thenb)free(ast->thenb);
-			if(ast->elseb)free(ast->elseb);
+		case AST_IF:{ //children are freed as subtrees; plain free() would leak their descendants (assumes they are AST* - confirm in parser.h)
+			if(ast->i.cond)ast_free(ast->i.cond);
+			if(ast->i.thenb)ast_free(ast->i.thenb);
+			if(ast->i.elseb)ast_free(ast->i.elseb);
 			break;
 		}
 
-		case AST_WHILE:{ ASTwhile *ast=(ASTwhile*)ast_;
-			if(ast->cond)free(ast->cond);
-			if(ast->body)free(ast->body);
+		case AST_WHILE:{ //same as AST_IF: release the child subtrees recursively
+			if(ast->w.cond)ast_free(ast->w.cond);
+			if(ast->w.body)ast_free(ast->w.body);
 			break;
 		}
 	}
-	free(ast_);
+	free(ast);
 }
diff --git a/parser.h b/parser.h
index 7d9b347..61d6dfe 100644
--- a/parser.h
+++ b/parser.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <stdio.h>
 #include <stdbool.h>
 #include <stdint.h>
 
@@ -86,5 +87,6 @@ typedef enum Associativity{
 } Associativity;
 
 
-ASTblock* parse(const char *source);
+AST* parse(const char *source);
+void ast_debug(FILE *stream,const AST *ast);
 void ast_free(AST *ast);
author	tomsmeding <tom.smeding@gmail.com>	2016-08-05 20:26:05 +0200
committer	tomsmeding <tom.smeding@gmail.com>	2016-08-06 10:10:32 +0200
commit	f67988fbfde6ad8a91466ef5d4227dcf9e5db6ce (patch)
tree	db85d3936f717331c3eeed4fae43e5ed43324be9
parent	e6bb770a52980ef3d85c2d4b93fb240c026ce7f7 (diff)