#define _GNU_SOURCE //asprintf #include #include #include #include #include #include #include "memory.h" #include "opfuncs.h" #include "parser.h" #define DEBUG #ifdef DEBUG #define DBG(...) __VA_ARGS__ #else #define DBG(...) #endif #define DBGF(...) DBG(fprintf(stderr,__VA_ARGS__)) #define NOT_IMPLEMENTED false static bool ishexdigit(char c){ return (c>='0'&&c<='9')||(c>='a'&&c<='f')||(c>='A'&&c<='F'); } static int hexnumber(char c){ return c<='9'?c-'0':(c&~('a'-'A'))-'A'+10; } static char hexencode(int n){ return n<10?n+'0':n+'a'; } static bool semicolon_needed_after(const AST *after){ //DBGF("semicolon_needed_after(type %d) -> ",after->type); switch(after->type){ case AST_BLOCK: case AST_IF: case AST_WHILE: case AST_FUNC: //DBGF("false\n"); return false; default: //DBGF("true\n"); return true; } } typedef enum Tokentype{ TT_NUM, TT_STR, TT_WORD, TT_OP, TT_SYM, //all symbols that are not operators TT_ENDSTMT, TT_EOF, TT_ERR=-1 } Tokentype; typedef struct Token{ Tokentype type; const char *str; //Pointer into source string, or NULL if TT_EOF int len; } Token; static bool parsecomment(const char **sourcep){ const char *source=*sourcep; if(*source!='#')return false; if(source[1]=='#'&&source[2]=='#'){ source+=3; while(*source&& (*source!='#'||source[1]!='#'||source[2]!='#')){ source++; } if(!*source)return false; //unclosed block comment source+=2; } else { while(*source&&*source!='\n')source++; if(*source)source++; } *sourcep=source; return true; } static void skipintermediate(const char **sourcep){ const char *source=*sourcep; bool acted; do { acted=false; while(isspace(*source)){ source++; acted=true; } if(parsecomment(&source)){ acted=true; } } while(acted); *sourcep=source; } static Token lasttoken; static const char *lasterrloc; static Token nexttoken(const char **sourcep,bool expectop){ skipintermediate(sourcep); const char *source=*sourcep; if(*source=='\0'){ Token tok={TT_EOF,NULL,-1}; return lasttoken=tok; } if(*source==';'){ Token tok={TT_ENDSTMT,source,1}; (*sourcep)++; return lasttoken=tok; } if(isdigit(*source)||(!expectop&&*source=='-'&&isdigit(source[1]))){ char *endp; strtod(source,&endp); assert(endp!=source); Token tok={TT_NUM,source,endp-source}; *sourcep=endp; return lasttoken=tok; } if(*source=='"'){ int i; for(i=1;source[i]&&source[i]!='"';i++){ if(source[i]=='\\')i++; } if(!source[i]){ Token tok={TT_ERR,"Non-terminated string",21}; return lasttoken=tok; } *sourcep+=i+1; Token tok={TT_STR,source,i+1}; return lasttoken=tok; } int oplen=parseoplength(source); if(oplen!=-1){ Token tok={TT_OP,source,oplen}; *sourcep+=oplen; return lasttoken=tok; } if(!expectop){ char buf[4]={'(',*source,')','\0'}; if(precedence(buf)!=-1){ Token tok={TT_OP,source,1}; (*sourcep)++; return lasttoken=tok; } } if(strchr("(){},",*source)!=NULL){ Token tok={TT_SYM,source,1}; (*sourcep)++; return lasttoken=tok; } if(isalpha(*source)||*source=='_'){ int i; for(i=1;source[i];i++){ if(!isalpha(source[i])&&!isdigit(source[i])&&source[i]!='_')break; } Token tok={TT_WORD,source,i}; *sourcep+=i; return lasttoken=tok; } Token tok={TT_ERR,"Unrecognised token",18}; lasterrloc=source; return lasttoken=tok; } #ifdef DEBUG static void printtoken(FILE *stream,Token tok,const char *msg){ const char *type; switch(tok.type){ case TT_NUM: type="TT_NUM"; break; case TT_STR: type="TT_STR"; break; case TT_WORD: type="TT_WORD"; break; case TT_OP: type="TT_OP"; break; case TT_SYM: type="TT_SYM"; break; case TT_ENDSTMT: type="TT_ENDSTMT"; break; case TT_EOF: type="TT_EOF"; break; case TT_ERR: type="TT_ERR"; break; default: type="TT_(??\?)"; break; //TRIGRAPHS ._. } if(tok.len!=-1){ char buf[tok.len+1]; memcpy(buf,tok.str,tok.len); buf[tok.len]='\0'; fprintf(stream,"(%s) Token: %s '%s'\n",msg,type,buf); } else { fprintf(stream,"(%s) Token: %s (null)\n",msg,type); } } #endif static const char *builtin_keywords[]={ "break", "continue", "return" }; static const int nbuiltin_keywords=sizeof(builtin_keywords)/sizeof(builtin_keywords[0]); static AST* parseexpr(const char *source,int *reslen,int minprec,int maxprec); static AST* parseterm(const char *source,int *reslen){ const char *origsource=source; const Token tok=nexttoken(&source,false); DBG(printtoken(stderr,tok,"parseterm")); AST *node; switch(tok.type){ case TT_NUM:{ node=malloc(sizeof(AST)); if(!node)outofmem(); node->type=AST_NUM; char *endp; int intv=strtol(tok.str,&endp,0); node->n.isint=endp-tok.str==tok.len; if(node->n.isint)node->n.i=intv; else node->n.d=strtod(tok.str,NULL); break; } case TT_STR:{ int slen=0; for(int i=1;i=tok.len-1||!ishexdigit(tok.str[i+1])||!ishexdigit(tok.str[i+2])){ return NULL; } i+=2; } } node=malloc(sizeof(AST)); if(!node)outofmem(); node->type=AST_STR; node->s.len=slen; node->s.str=malloc(slen+1); if(!node->s.str)outofmem(); int j=0; for(int i=1;is.str[j++]=tok.str[i]; continue; } i++; switch(tok.str[i]){ case 'n': node->s.str[j++]='\n'; break; case 'r': node->s.str[j++]='\r'; break; case 't': node->s.str[j++]='\t'; break; case 'b': node->s.str[j++]='\b'; break; case 'a': node->s.str[j++]='\a'; break; case 'x': node->s.str[j++]=16*hexnumber(tok.str[i+1])+hexnumber(tok.str[i+2]); i+=2; break; default: node->s.str[j++]=tok.str[i]; break; } } node->s.str[j]='\0'; break; } case TT_WORD:{ if(tok.len==2&&memcmp(tok.str,"if",2)==0){ int len; AST *cond=parseexpr(source,&len,0,INT_MAX); if(!cond)return NULL; source+=len; AST *thenbody=parseexpr(source,&len,0,INT_MAX); if(!thenbody){ ast_free(cond); return NULL; } source+=len; if(semicolon_needed_after(thenbody)){ Token sctok=nexttoken(&source,false); if(sctok.type!=TT_ENDSTMT){ ast_free(cond); ast_free(thenbody); return NULL; } DBG(printtoken(stderr,sctok,"afterthen")); } AST *elsebody=NULL; const char *src=source; Token elsetok=nexttoken(&src,false); if(elsetok.type==TT_WORD&&elsetok.len==4&&memcmp(elsetok.str,"else",4)==0){ DBG(printtoken(stderr,elsetok,"if else ")); source=src; elsebody=parseexpr(source,&len,0,INT_MAX); if(!elsebody){ ast_free(cond); ast_free(thenbody); return NULL; } source+=len; if(semicolon_needed_after(elsebody)){ Token sctok=nexttoken(&source,false); if(sctok.type!=TT_ENDSTMT){ ast_free(cond); ast_free(thenbody); ast_free(elsebody); return NULL; } DBG(printtoken(stderr,sctok,"afterelse")); } } node=malloc(sizeof(AST)); if(!node)outofmem(); node->type=AST_IF; node->i.cond=cond; node->i.thenb=thenbody; node->i.elseb=elsebody; break; } if(tok.len==5&&memcmp(tok.str,"while",2)==0){ int len; AST *cond=parseexpr(source,&len,0,INT_MAX); if(!cond)return NULL; source+=len; AST *body=parseexpr(source,&len,0,INT_MAX); if(!body){ ast_free(cond); return NULL; } source+=len; if(semicolon_needed_after(body)){ Token sctok=nexttoken(&source,false); if(sctok.type!=TT_ENDSTMT){ ast_free(cond); ast_free(body); return NULL; } DBG(printtoken(stderr,sctok,"afterwhbd")); } node=malloc(sizeof(AST)); if(!node)outofmem(); node->type=AST_WHILE; node->w.cond=cond; node->w.body=body; break; } const char *tempsource=source; Token next=nexttoken(&source,false); if(next.len==1&&next.str[0]=='('){ DBG(printtoken(stderr,next,"function ")); AST **args=malloc(sizeof(AST*)); int nargs=0; int len; #define FREEARGSRETNULL do {for(int i=0;itype=AST_CALL; int i; for(i=0;ic.func=malloc(tok.len+prefixlen+1); if(!node->c.func)outofmem(); memcpy(node->c.func,"__builtin_",prefixlen); memcpy(node->c.func+prefixlen,tok.str,tok.len); node->c.func[prefixlen+tok.len]='\0'; } else { node->c.func=malloc(tok.len+1); if(!node->c.func)outofmem(); memcpy(node->c.func,tok.str,tok.len); node->c.func[tok.len]='\0'; } node->c.nargs=nargs; node->c.args=args; #undef FREEARGSRETNULL } else { source=tempsource; node=malloc(sizeof(AST)); if(!node)outofmem(); int i; for(i=0;itype=AST_CALL; int prefixlen=strlen("__builtin_"); node->c.func=malloc(tok.len+prefixlen+1); if(!node->c.func)outofmem(); memcpy(node->c.func,"__builtin_",prefixlen); memcpy(node->c.func+prefixlen,tok.str,tok.len); node->c.func[prefixlen+tok.len]='\0'; node->c.nargs=0; node->c.args=malloc(1); } else { node->type=AST_VAR; node->v.name=malloc(tok.len+1); if(!node->v.name)outofmem(); memcpy(node->v.name,tok.str,tok.len); node->v.name[tok.len]='\0'; } } break; } case TT_SYM: if(tok.len==1&&tok.str[0]=='{'){ node=malloc(sizeof(AST)); if(!node)outofmem(); node->type=AST_BLOCK; int sz=32; node->b.len=0; node->b.exprs=calloc(sz,sizeof(AST*)); if(!node->b.exprs)outofmem(); int len; int cursor=0; while(true){ if(node->b.len==sz){ sz*=2; node->b.exprs=realloc(node->b.exprs,sz*sizeof(AST*)); if(!node->b.exprs)outofmem(); } AST *n=parseexpr(source+cursor,&len,0,INT_MAX); if(!n){ ast_free(node); return NULL; } node->b.exprs[node->b.len++]=n; cursor+=len; if(semicolon_needed_after(n)){ const char *src=source+cursor; Token sctok=nexttoken(&src,false); DBG(printtoken(stderr,sctok,"block ; ")); if(sctok.type!=TT_ENDSTMT){ ast_free(node); return NULL; } cursor=src-source; } const char *src=source+cursor; Token brtok=nexttoken(&src,false); if(brtok.type==TT_SYM&&brtok.len==1&&brtok.str[0]=='}'){ source=src; DBG(printtoken(stderr,brtok,"block } ")); break; } } } else if(tok.len==1&&tok.str[0]=='('){ int len; node=parseexpr(source,&len,0,INT_MAX); if(!node)return NULL; source+=len; Token aftertok=nexttoken(&source,false); DBG(printtoken(stderr,aftertok,"parenclose")); if(aftertok.type!=TT_SYM||aftertok.len!=1||aftertok.str[0]!=')'){ ast_free(node); return NULL; } } else { return NULL; } break; case TT_OP:{ char buf[tok.len+3]; buf[0]='('; memcpy(buf+1,tok.str,tok.len); buf[tok.len+1]=')'; buf[tok.len+2]='\0'; if(associativity(buf)==AS_PREFIX){ node=malloc(sizeof(AST)); if(!node)outofmem(); node->type=AST_OP; node->o.op=opconststring_len(buf,tok.len+2); node->o.left=NULL; int len; node->o.right=parseexpr(source,&len,precedence(buf),INT_MAX); if(!node->o.right){ free(node); return NULL; } source+=len; } else return NULL; break; } case TT_ENDSTMT: case TT_EOF: case TT_ERR: return NULL; } *reslen=source-origsource; return node; } //Uses precedence climbing; maxprec is INT_MAX unless dealing with nonassociative operators static AST* parseexpr_(const char *source,int *reslen,int minprec,int maxprec){ const char *origsource=source; int len; AST *tree=parseterm(source,&len); if(!tree)return NULL; source+=len; while(true){ const char *beforeop=source; Token tok=nexttoken(&source,true); DBG(printtoken(stderr,tok,"parseEXPR")); if(tok.type==TT_ENDSTMT){ DBGF(" (token undo :%d)\n",__LINE__); source=beforeop; break; } if(tok.type==TT_SYM&&tok.len==1&&(tok.str[0]==')'||tok.str[0]==',')){ DBGF(" (token undo :%d)\n",__LINE__); source=beforeop; break; } if(tok.type!=TT_OP){ /*ast_free(tree); return NULL;*/ DBGF(" (token undo :%d)\n",__LINE__); source=beforeop; break; } int prec=precedence_len(tok.str,tok.len); if(precmaxprec){ ast_free(tree); return NULL; } Associativity assoc=associativity_len(tok.str,tok.len); int q; switch(assoc){ case AS_PREFIX: case AS_SUFFIX: ast_free(tree); return NULL; case AS_LEFT: q=prec+1; break; case AS_RIGHT: q=prec; break; case AS_NONASSOC: q=prec+1; maxprec=prec-1; break; default: assert(false); } AST *right=parseexpr(source,&len,q,maxprec); if(!right){ ast_free(tree); return NULL; } source+=len; AST *opnode=malloc(sizeof(AST)); if(!opnode)outofmem(); opnode->type=AST_OP; opnode->o.op=opconststring_len(tok.str,tok.len); if(!opnode->o.op)outofmem(); opnode->o.left=tree; opnode->o.right=right; tree=opnode; } *reslen=source-origsource; return tree; } static AST* parseexpr(const char *source,int *reslen,int minprec,int maxprec){ static int depth=0; DBGF("\x1B[32mEXPR ENTER >>> (%d)\x1B[0m\n",depth); depth++; AST *r=parseexpr_(source,reslen,minprec,maxprec); depth--; DBGF("\x1B[32mEXPR LEAVE <<< (%d) (-> %p)\x1B[0m\n",depth,r); return r; } static AST* parsefunction(const char *source,int *reslen){ const char *origsource=source; Token tok=nexttoken(&source,false); if(tok.type!=TT_WORD||tok.len!=3||memcmp(tok.str,"def",3)!=0){ return NULL; } DBG(printtoken(stderr,tok,"func def ")); tok=nexttoken(&source,false); if(tok.type!=TT_WORD)return NULL; DBG(printtoken(stderr,tok,"func name")); AST *func=malloc(sizeof(AST)); if(!func)outofmem(); func->type=AST_FUNC; func->f.name=malloc(tok.len+1); if(!func->f.name)outofmem(); memcpy(func->f.name,tok.str,tok.len); func->f.name[tok.len]='\0'; func->f.nargs=0; int argssz=2; func->f.args=malloc(argssz*sizeof(char*)); if(!func->f.args)outofmem(); func->f.body=NULL; tok=nexttoken(&source,false); if(tok.type!=TT_SYM||tok.len!=1||tok.str[0]!='('){ ast_free(func); return NULL; } DBG(printtoken(stderr,tok,"func(open")); while(true){ tok=nexttoken(&source,false); if(tok.type==TT_SYM&&tok.len==1&&tok.str[0]==')'){ DBG(printtoken(stderr,tok,"func)clse")); break; } if(tok.type!=TT_WORD){ ast_free(func); return NULL; } DBG(printtoken(stderr,tok,"func arg ")); if(func->f.nargs==argssz){ argssz*=2; func->f.args=realloc(func->f.args,argssz*sizeof(char*)); if(!func->f.args)outofmem(); } char *arg=malloc(tok.len+1); if(!arg)outofmem(); memcpy(arg,tok.str,tok.len); arg[tok.len]='\0'; func->f.args[func->f.nargs++]=arg; tok=nexttoken(&source,false); DBG(printtoken(stderr,tok,"func sep ")); if(tok.type!=TT_SYM||tok.len!=1||(tok.str[0]!=','&&tok.str[0]!=')')){ ast_free(func); return NULL; } if(tok.str[0]==')')break; } int len; func->f.body=parseexpr(source,&len,0,INT_MAX); if(!func->f.body){ ast_free(func); return NULL; } source+=len; *reslen=source-origsource; return func; } static AST* parseprogram(const char *source,int *reslen){ const char *origsource=source; AST *prog=malloc(sizeof(AST)); if(!prog)outofmem(); prog->type=AST_PROGRAM; prog->p.nfuncs=0; int sz=2; prog->p.funcs=malloc(sz*sizeof(AST*)); if(!prog->p.funcs)outofmem(); while(true){ const char *src=source; Token tok=nexttoken(&src,false); if(tok.type==TT_EOF)break; int len; AST *func=parsefunction(source,&len); if(!func){ ast_free(prog); return NULL; } prog->p.funcs[prog->p.nfuncs++]=func; source+=len; } *reslen=source-origsource; return prog; } static char* reportparseerror(const char *source){ const char *loc=lasttoken.type==TT_ERR?lasterrloc:lasttoken.str; if(loc==NULL){ char *buf; asprintf(&buf,"\x1B[1mParse error: unexpected end of file\x1B[0m"); return buf; } assert(loc>=source); int i; for(i=0;i<100;i++){ if(loc-i==source||loc[-i]=='\n')break; } bool cutoff=i==100; const char *start=loc-i+(cutoff||loc[-i]=='\n'); for(;i<150;i++){ if(start[i]=='\0'||start[i]=='\n')break; } bool endcutoff=i==150; int totallen=17+4*cutoff+i+7+6+4*endcutoff+4; char *errstr=malloc(totallen+1); if(!errstr)outofmem(); int offset=0; memcpy(errstr+offset,"\x1B[1mParse error: ",17); offset+=17; if(cutoff){memcpy(errstr+offset,"... ",4); offset+=4;} memcpy(errstr+offset,start,loc-start); offset+=loc-start; memcpy(errstr+offset,"\x1B[31;4m",7); offset+=7; memcpy(errstr+offset,loc,lasttoken.len); offset+=lasttoken.len; memcpy(errstr+offset,"\x1B[0;1m",6); offset+=6; memcpy(errstr+offset,loc+lasttoken.len,i-(loc-start)-lasttoken.len); offset+=i-(loc-start)-lasttoken.len; memcpy(errstr+offset,"\x1B[0m",4); offset+=4; errstr[offset]='\0'; return errstr; } AST* parse(const char *source,char **errmsg){ *errmsg=NULL; int reslen; AST *pr=parseprogram(source,&reslen); source+=reslen; if(lasttoken.type!=TT_EOF||!pr){ *errmsg=reportparseerror(source); if(pr)ast_free(pr); return NULL; } return pr; } static const char* charblock(char c,int n){ static char *buf=NULL; if(!buf)buf=malloc(n+1); else buf=realloc(buf,n+1); if(!buf)outofmem(); memset(buf,c,n); buf[n]='\0'; return buf; } #define TABW (4) #define INDENT fprintf(stream,"%s",charblock(' ',TABW*indent)); static void ast_debug_(FILE *stream,const AST *ast,int indent){ switch(ast->type){ case AST_BLOCK: if(ast->b.len==0){ fprintf(stream,"{}"); break; } fprintf(stream,"{\n"); indent++; for(int i=0;ib.len;i++){ INDENT ast_debug_(stream,ast->b.exprs[i],indent); if(semicolon_needed_after(ast->b.exprs[i]))fputc(';',stream); fputc('\n',stream); } indent--; INDENT fprintf(stream,"}"); break; case AST_OP:{ bool leftp=ast->o.left&&ast->o.left->type==AST_OP&&precedence(ast->o.left->o.op)<=precedence(ast->o.op); bool rightp=ast->o.right&&ast->o.right->type==AST_OP&&precedence(ast->o.right->o.op)<=precedence(ast->o.op); bool spaces=ast->o.left&&ast->o.right; //fprintf(stderr,"[[op='%s' p=%d lp=%d rp=%d]]",ast->o.op,precedence(ast->o.op),leftp,rightp); if(leftp)fputc('(',stream); if(ast->o.left)ast_debug_(stream,ast->o.left,indent); fprintf(stream,"%s%s%s%s%s",leftp?")":"",spaces?" ":"",ast->o.op,spaces?" ":"",rightp?"(":""); if(ast->o.right)ast_debug_(stream,ast->o.right,indent); if(rightp)fputc(')',stream); break; } case AST_NUM: if(ast->n.isint)fprintf(stream,"%lld",ast->n.i); else fprintf(stream,"%g",ast->n.d); break; case AST_STR: fputc('"',stream); for(int i=0;is.len;i++){ if(ast->s.str[i]<32||ast->s.str[i]>126){ fprintf(stream,"\\x%c%c",hexencode((unsigned char)ast->s.str[i]/16),hexencode((unsigned char)ast->s.str[i]%16)); } else fputc(ast->s.str[i],stream); } fputc('"',stream); break; case AST_VAR: fprintf(stream,"%s",ast->v.name); break; case AST_CALL: fprintf(stream,"%s(",ast->c.func); for(int i=0;ic.nargs;i++){ if(i!=0)fprintf(stream,", "); ast_debug_(stream,ast->c.args[i],indent); } fputc(')',stream); break; case AST_IF: fprintf(stream,"if "); ast_debug_(stream,ast->i.cond,indent); fputc(' ',stream); ast_debug_(stream,ast->i.thenb,indent); if(semicolon_needed_after(ast->i.thenb))fputc(';',stream); if(ast->i.elseb){ fputc('\n',stream); INDENT fprintf(stream,"else "); ast_debug_(stream,ast->i.elseb,indent); if(semicolon_needed_after(ast->i.elseb))fputc(';',stream); } break; case AST_WHILE: fprintf(stream,"while "); ast_debug_(stream,ast->w.cond,indent); fputc(' ',stream); ast_debug_(stream,ast->w.body,indent); if(semicolon_needed_after(ast->w.body))fputc(';',stream); break; case AST_FUNC: fprintf(stream,"def %s(",ast->f.name); for(int j=0;jf.nargs;j++){ if(j!=0)fprintf(stream,", "); fprintf(stream,"%s",ast->f.args[j]); } fprintf(stream,") "); ast_debug_(stream,ast->f.body,indent); break; case AST_PROGRAM: for(int i=0;ip.nfuncs;i++){ if(i!=0){ fputc('\n',stream); INDENT } ast_debug_(stream,ast->p.funcs[i],indent); } break; default: fprintf(stream,"AST_(??\?)"); break; } } void ast_debug(FILE *stream,const AST *ast){ ast_debug_(stream,ast,0); fputc('\n',stream); } void ast_free(AST *ast){ switch(ast->type){ case AST_BLOCK: for(int i=0;ib.len;i++)if(ast->b.exprs[i])ast_free(ast->b.exprs[i]); free(ast->b.exprs); break; case AST_OP: if(ast->o.left)ast_free(ast->o.left); if(ast->o.right)ast_free(ast->o.right); break; case AST_NUM: break; case AST_STR: if(ast->s.str)free(ast->s.str); break; case AST_VAR: if(ast->v.name)free(ast->v.name); break; case AST_CALL: if(ast->c.func)free(ast->c.func); for(int i=0;ic.nargs;i++)if(ast->c.args[i])ast_free(ast->c.args[i]); free(ast->c.args); break; case AST_IF: if(ast->i.cond)ast_free(ast->i.cond); if(ast->i.thenb)ast_free(ast->i.thenb); if(ast->i.elseb)ast_free(ast->i.elseb); break; case AST_WHILE: if(ast->w.cond)ast_free(ast->w.cond); if(ast->w.body)ast_free(ast->w.body); break; case AST_FUNC: if(ast->f.name)free(ast->f.name); for(int i=0;if.nargs;i++)if(ast->f.args[i])free(ast->f.args[i]); free(ast->f.args); if(ast->f.body)ast_free(ast->f.body); break; case AST_PROGRAM: for(int i=0;ip.nfuncs;i++)if(ast->p.funcs[i])ast_free(ast->p.funcs[i]); free(ast->p.funcs); break; } free(ast); }