diff options
Diffstat (limited to 'parser.c')
-rw-r--r-- | parser.c | 538 |
1 files changed, 430 insertions, 108 deletions
@@ -2,193 +2,515 @@ #include <stdlib.h> #include <string.h> #include <ctype.h> +#include <assert.h> #include "memory.h" +#include "opfuncs.h" #include "parser.h" +#define NOT_IMPLEMENTED false + + +static bool ishexdigit(char c){ + return (c>='0'&&c<='9')||(c>='a'&&c<='f')||(c>='A'&&c<='F'); +} + +static int hexnumber(char c){ + return c<='9'?c-'0':(c&~('a'-'A'))-'A'+10; +} + +static char hexencode(int n){ + return n<10?n+'0':n+'a'; +} + + typedef enum Tokentype{ TT_NUM, TT_STR, TT_WORD, - TT_SYM + TT_OP, + TT_SYM, //all symbols that are not operators + TT_ENDSTMT, + TT_EOF, + + TT_ERR=-1 } Tokentype; typedef struct Token{ - const char *str; + Tokentype type; + const char *str; //Part of another string; not null-terminated, and do not free int len; } Token; -Token nexttoken(const char **sourcep){ + +static bool parsecomment(const char **sourcep){ + const char *source=*sourcep; + if(*source!='#')return false; + if(source[1]=='#'&&source[2]=='#'){ + source+=3; + while(*source&& + (*source!='#'||source[1]!='#'||source[2]!='#')){ + source++; + } + if(!*source)return false; //unclosed block comment + source+=2; + } else { + while(*source&&*source!='\n')source++; + if(*source)source++; + } + *sourcep=source; + return true; +} + +static void skipintermediate(const char **sourcep){ + const char *source=*sourcep; + bool acted; + do { + acted=false; + while(isspace(*source)){ + source++; + acted=true; + } + if(parsecomment(&source)){ + acted=true; + } + } while(acted); + *sourcep=source; +} + +static Token nexttoken(const char **sourcep){ + skipintermediate(sourcep); const char *source=*sourcep; - while(isspace(*source))source++; + if(*source=='\0'){ + Token tok={TT_EOF,NULL,-1}; + return tok; + } + if(*source==';'){ + Token tok={TT_ENDSTMT,source,1}; + (*sourcep)++; + return tok; + } if(isdigit(*source)||(*source=='-'&&isdigit(source[1]))){ char *endp; strtod(source,&endp); assert(endp!=source); - Token tok={source,endp-source}; + Token tok={TT_NUM,source,endp-source}; + *sourcep=endp; return tok; } -} - - -int precedence(const char *op){ - switch(op[0]){ - case '!': return op[1]=='\0' ? 12 : op[1]=='='&&op[2]=='\0' ? 5 : -1; - case '%': return op[1]=='\0' ? 11 : -1; - case '&': return op[1]=='\0' ? 9 : op[1]=='&'&&op[2]=='\0' ? 4 : -1; - case '*': return op[1]=='\0' ? 11 : op[1]=='*'&&op[2]=='\0' ? 14 : -1; - case '+': return op[1]=='\0' ? 10 : -1; - case '-': return op[1]=='\0' ? 10 : -1; - case '/': return op[1]=='\0' ? 11 : op[1]=='/'&&op[2]=='\0' ? 11 : -1; - case '<': return op[1]=='\0' ? 6 : op[1]=='='&&op[2]=='\0' ? 6 : -1; - case '=': return op[1]=='\0' ? 1 : op[1]=='='&&op[2]=='\0' ? 5 : -1; - case '>': return op[1]=='\0' ? 6 : op[1]=='='&&op[2]=='\0' ? 6 : -1; - case '^': return op[1]=='\0' ? 8 : op[1]=='^'&&op[2]=='\0' ? 3 : -1; - case '|': return op[1]=='\0' ? 7 : op[1]=='|'&&op[2]=='\0' ? 2 : -1; - case '~': return op[1]=='\0' ? 12 : -1; - default: return -1; + if(*source=='"'){ + int i; + for(i=1;source[i]&&source[i]!='"';i++){ + if(source[i]=='\\')i++; + } + if(!source[i]){ + Token tok={TT_ERR,"Non-terminated string",21}; + return tok; + } + *sourcep+=i+1; + Token tok={TT_STR,source,i+1}; + return tok; } -} - -int associativity(const char *op){ - switch(op[0]){ - case '!': return op[1]=='\0' ? AS_PREFIX : op[1]=='='&&op[2]=='\0' ? AS_NONASSOC : -1; - case '%': return op[1]=='\0' ? AS_LEFT : -1; - case '&': return op[1]=='\0' ? AS_LEFT : op[1]=='&'&&op[2]=='\0' ? AS_LEFT : -1; - case '*': return op[1]=='\0' ? AS_LEFT : op[1]=='*'&&op[2]=='\0' ? AS_RIGHT : -1; - case '+': return op[1]=='\0' ? AS_LEFT : -1; - case '-': return op[1]=='\0' ? AS_LEFT : -1; - case '/': return op[1]=='\0' ? AS_LEFT : op[1]=='/'&&op[2]=='\0' ? AS_LEFT : -1; - case '<': return op[1]=='\0' ? AS_NONASSOC : op[1]=='='&&op[2]=='\0' ? AS_NONASSOC : -1; - case '=': return op[1]=='\0' ? AS_RIGHT : op[1]=='='&&op[2]=='\0' ? AS_NONASSOC : -1; - case '>': return op[1]=='\0' ? AS_NONASSOC : op[1]=='='&&op[2]=='\0' ? AS_NONASSOC : -1; - case '^': return op[1]=='\0' ? AS_LEFT : op[1]=='^'&&op[2]=='\0' ? AS_LEFT : -1; - case '|': return op[1]=='\0' ? AS_LEFT : op[1]=='|'&&op[2]=='\0' ? AS_LEFT : -1; - case '~': return op[1]=='\0' ? AS_PREFIX : -1; - default: return -1; + int oplen=parseoplength(source); + if(oplen!=-1){ + Token tok={TT_OP,source,oplen}; + *sourcep+=oplen; + return tok; + } + if(strchr("(){}",*source)!=NULL){ + Token tok={TT_SYM,source,1}; + (*sourcep)++; + return tok; + } + if(isalpha(*source)||*source=='_'){ + int i; + for(i=1;source[i];i++){ + if(!isalpha(source[i])&&!isdigit(source[i])&&source[i]!='_')break; + } + Token tok={TT_WORD,source,i}; + *sourcep+=i; + return tok; } + Token tok={TT_ERR,"Unrecognised token",18}; + return tok; } -static bool parsecomment(const char *source,int *reslen){ - int cursor=0; - if(source[cursor]!='#')return false; - if(source[cursor+1]=='#'&&source[cursor+2]=='#'){ - cursor+=3; - while(source[cursor]&& - (source[cursor]!='#'||source[cursor+1]!='#'||source[cursor+2]!='#')){ - cursor++; - } - if(!source[cursor])return false; //unclosed block comment - cursor+=2; +static void printtoken(FILE *stream,Token tok,const char *msg){ + const char *type; + switch(tok.type){ + case TT_NUM: type="TT_NUM"; break; + case TT_STR: type="TT_STR"; break; + case TT_WORD: type="TT_WORD"; break; + case TT_OP: type="TT_OP"; break; + case TT_SYM: type="TT_SYM"; break; + case TT_ENDSTMT: type="TT_ENDSTMT"; break; + case TT_EOF: type="TT_EOF"; break; + case TT_ERR: type="TT_ERR"; break; + default: type="TT_(??\?)"; break; //TRIGRAPHS ._. + } + if(tok.len!=-1){ + char buf[tok.len+1]; + memcpy(buf,tok.str,tok.len); + buf[tok.len]='\0'; + fprintf(stream,"(%s) Token: %s '%s'\n",msg,type,buf); } else { - while(source[cursor]&&source[cursor]!='\n')cursor++; - if(source[cursor])cursor++; + fprintf(stream,"(%s) Token: %s (null)\n",msg,type); } - *reslen=cursor; - return true; } -static void parseintermediate(const char *source,int *reslen){ - int cursor=0; - bool acted; - do { - acted=false; - while(source[cursor]&&isspace(source[cursor])){ - cursor++; - acted=true; + +static AST* parseterm(const char *source,int *reslen){ + const char *origsource=source; + const Token tok=nexttoken(&source); + printtoken(stderr,tok,"parseterm"); + AST *node; + switch(tok.type){ + case TT_NUM:{ + node=malloc(sizeof(AST)); + if(!node)outofmem(); + node->type=AST_NUM; + char *endp; + int intv=strtol(tok.str,&endp,0); + node->n.isint=endp-tok.str==tok.len; + if(node->n.isint)node->n.i=intv; + else node->n.d=strtod(tok.str,NULL); + break; } - int partlen; - if(parsecomment(source+cursor,&partlen)){ - cursor+=partlen; - acted=true; + + case TT_STR:{ + int slen=0; + for(int i=1;i<tok.len-1;i++){ + slen++; + if(tok.str[i]!='\\')continue; + i++; + if(tok.str[i]=='x'){ + if(i+2>=tok.len-1||!ishexdigit(tok.str[i+1])||!ishexdigit(tok.str[i+2])){ + return NULL; + } + i+=2; + } else { + i++; + } + } + node=malloc(sizeof(AST)); + if(!node)outofmem(); + node->type=AST_STR; + node->s.str=malloc(slen+1); + if(!node->s.str)outofmem(); + int j=0; + for(int i=1;i<tok.len-1;i++){ + if(tok.str[i]!='\\'){ + node->s.str[j++]=tok.str[i]; + continue; + } + i++; + switch(tok.str[i]){ + case 'n': node->s.str[j++]='\n'; break; + case 'r': node->s.str[j++]='\r'; break; + case 't': node->s.str[j++]='\t'; break; + case 'b': node->s.str[j++]='\b'; break; + case 'a': node->s.str[j++]='\a'; break; + case 'x': + node->s.str[j++]=16*hexnumber(tok.str[i+1])+hexnumber(tok.str[i+2]); + i+=2; + break; + default: + node->s.str[j++]=tok.str[i]; + break; + } + } + node->s.str[j]='\0'; + break; } - } while(acted); - *reslen=cursor; + + case TT_WORD:{ + if(tok.len==2&&memcmp(tok.str,"if",2)==0)assert(NOT_IMPLEMENTED); + if(tok.len==5&&memcmp(tok.str,"while",2)==0)assert(NOT_IMPLEMENTED); + const char *tempsource=source; + Token next=nexttoken(&source); + if(next.len==1&&next.str[0]=='(')assert(NOT_IMPLEMENTED); + source=tempsource; + node=malloc(sizeof(AST)); + if(!node)outofmem(); + node->type=AST_VAR; + node->v.name=malloc(tok.len+1); + if(!node->v.name)outofmem(); + memcpy(node->v.name,tok.str,tok.len); + node->v.name[tok.len]='\0'; + break; + } + + case TT_SYM: + assert(NOT_IMPLEMENTED); + break; + + case TT_OP:{ + char buf[tok.len+3]; + buf[0]='('; + memcpy(buf+1,tok.str,tok.len); + buf[tok.len+1]=')'; + buf[tok.len+2]='\0'; + if(associativity(buf)==AS_PREFIX){ + node=malloc(sizeof(AST)); + if(!node)outofmem(); + node->type=AST_OP; + node->o.op=opconststring_len(buf,tok.len+2); + node->o.left=NULL; + int len; + node->o.right=parseterm(source,&len); + if(!node->o.right){ + free(node); + return NULL; + } + source+=len; + } else return NULL; + break; + } + + case TT_ENDSTMT: + case TT_EOF: + case TT_ERR: + return NULL; + } + *reslen=source-origsource; + return node; } +//Uses precedence climbing static AST* parseexpr(const char *source,int *reslen,int minprec){ - ; + const char *origsource=source; + int len; + AST *tree=parseterm(source,&len); + if(!tree)return NULL; + source+=len; + while(true){ + const char *beforeop=source; + Token tok=nexttoken(&source); + printtoken(stderr,tok,"parseEXPR"); + if(tok.type==TT_ENDSTMT){ + source=beforeop; + break; + } + if(tok.type!=TT_OP){ + ast_free(tree); + return NULL; + } + int prec=precedence_len(tok.str,tok.len); + if(prec<minprec){ + source=beforeop; + break; + } + Associativity assoc=associativity_len(tok.str,tok.len); + int q; + switch(assoc){ + case AS_PREFIX: case AS_SUFFIX: + ast_free(tree); + return NULL; + + case AS_LEFT: q=prec+1; break; + case AS_RIGHT: q=prec; break; + case AS_NONASSOC: q=prec+1; minprec=prec+1; break; + + default: assert(false); + } + AST *right=parseexpr(source,&len,q); + if(!right){ + ast_free(tree); + return NULL; + } + source+=len; + AST *opnode=malloc(sizeof(AST)); + if(!opnode)outofmem(); + opnode->type=AST_OP; + opnode->o.op=opconststring_len(tok.str,tok.len); + if(!opnode->o.op)outofmem(); + opnode->o.left=tree; + opnode->o.right=right; + tree=opnode; + } + *reslen=source-origsource; + return tree; } static AST* parsestmt(const char *source,int *reslen){ return parseexpr(source,reslen,0); } -ASTblock* parse(const char *source){ - ASTblock *bl=malloc(sizeof(ASTblock)); +AST* parse(const char *source){ + AST *bl=malloc(sizeof(AST)); + if(!bl)outofmem(); + bl->type=AST_BLOCK; int sz=32; - bl->len=0; - bl->exprs=calloc(sz,sizeof(AST*)); - if(!bl->exprs)outofmem(); + bl->b.len=0; + bl->b.exprs=calloc(sz,sizeof(AST*)); + if(!bl->b.exprs)outofmem(); int reslen; int cursor=0; while(true){ - if(bl->len==sz){ + if(bl->b.len==sz){ sz*=2; - bl->exprs=realloc(bl->exprs,sz*sizeof(AST*)); - if(!bl->exprs)outofmem(); + bl->b.exprs=realloc(bl->b.exprs,sz*sizeof(AST*)); + if(!bl->b.exprs)outofmem(); } - parseintermediate(source+cursor,&reslen); - if(!source[cursor])break; AST *node=parsestmt(source+cursor,&reslen); if(!node){ - ast_free((AST*)bl); + ast_free(bl); return NULL; } - bl->exprs[bl->len++]=node; + bl->b.exprs[bl->b.len++]=node; cursor+=reslen; + const char *src=source+cursor; + Token tok=nexttoken(&src); + if(tok.type!=TT_ENDSTMT){ + ast_free(bl); + return NULL; + } + cursor=src-source; + src=source+cursor; + tok=nexttoken(&src); + if(tok.type==TT_EOF)break; } return bl; } -void ast_free(AST *ast_){ - switch(ast_->type){ - case AST_BLOCK:{ ASTblock *ast=(ASTblock*)ast_; - for(int i=0;i<ast->len;i++)if(ast->exprs[i])ast_free(ast->exprs[i]); - free(ast->exprs); +static const char* charblock(char c,int n){ + static char *buf=NULL; + if(!buf)buf=malloc(n+1); + else buf=realloc(buf,n+1); + if(!buf)outofmem(); + memset(buf,c,n); + buf[n]='\0'; + return buf; +} + +#define TABW (4) +#define INDENT fprintf(stream,"%s",charblock(' ',TABW*indent)); +static void ast_debug_(FILE *stream,const AST *ast,int indent){ + switch(ast->type){ + case AST_BLOCK: + if(ast->b.len==0){ + fprintf(stream,"{}"); + break; + } + fprintf(stream,"{\n"); + indent++; + for(int i=0;i<ast->b.len;i++){ + INDENT + ast_debug_(stream,ast->b.exprs[i],indent); + fputc('\n',stream); + } + indent--; + INDENT + fprintf(stream,"}"); + break; + + case AST_OP:{ + bool leftp=ast->o.left&&ast->o.left->type==AST_OP&&precedence(ast->o.left->o.op)<=precedence(ast->o.op); + bool rightp=ast->o.right&&ast->o.right->type==AST_OP&&precedence(ast->o.right->o.op)<=precedence(ast->o.op); + //fprintf(stderr,"[[op='%s' p=%d lp=%d rp=%d]]",ast->o.op,precedence(ast->o.op),leftp,rightp); + if(leftp)fputc('(',stream); + if(ast->o.left)ast_debug_(stream,ast->o.left,indent); + fprintf(stream,"%s%s%s",leftp?")":"",ast->o.op,rightp?"(":""); + if(ast->o.right)ast_debug_(stream,ast->o.right,indent); + if(rightp)fputc(')',stream); + break; + } + + case AST_NUM: + if(ast->n.isint)fprintf(stream,"%lld",ast->n.i); + else fprintf(stream,"%g",ast->n.d); + break; + + case AST_STR: + fputc('"',stream); + for(int i=0;i<ast->s.len;i++){ + if(ast->s.str[i]<32||ast->s.str[i]>126){ + fprintf(stream,"\\x%c%c",hexencode(ast->s.str[i]/16),hexencode(ast->s.str[i]%16)); + } else fputc(ast->s.str[i],stream); + } + fputc('"',stream); + break; + + case AST_VAR: + fprintf(stream,"%s",ast->v.name); + break; + + case AST_CALL: + fprintf(stream,"%s(",ast->c.func); + for(int i=0;i<ast->c.nargs;i++){ + if(i!=0)fputc(',',stream); + ast_debug_(stream,ast->c.args[i],indent); + } + fputc(')',stream); + break; + + case AST_IF: + assert(NOT_IMPLEMENTED); + break; + + case AST_WHILE: + assert(NOT_IMPLEMENTED); + break; + + default: + fprintf(stream,"AST_(??\?)"); + break; + } +} + +void ast_debug(FILE *stream,const AST *ast){ + ast_debug_(stream,ast,0); + fputc('\n',stream); +} + +void ast_free(AST *ast){ + switch(ast->type){ + case AST_BLOCK:{ + for(int i=0;i<ast->b.len;i++)if(ast->b.exprs[i])ast_free(ast->b.exprs[i]); + free(ast->b.exprs); break; } - case AST_OP:{ ASTop *ast=(ASTop*)ast_; - if(ast->left)ast_free(ast->left); - if(ast->right)ast_free(ast->right); + case AST_OP:{ + if(ast->o.left)ast_free(ast->o.left); + if(ast->o.right)ast_free(ast->o.right); break; } case AST_NUM: break; - case AST_STR:{ ASTstr *ast=(ASTstr*)ast_; - if(ast->str)free(ast->str); + case AST_STR:{ + if(ast->s.str)free(ast->s.str); break; } - case AST_VAR:{ ASTvar *ast=(ASTvar*)ast_; - if(ast->name)free(ast->name); + case AST_VAR:{ + if(ast->v.name)free(ast->v.name); break; } - case AST_CALL:{ ASTcall *ast=(ASTcall*)ast_; - if(ast->func)free(ast->func); - for(int i=0;i<ast->nargs;i++)if(ast->args[i])ast_free(ast->args[i]); - free(ast->args); + case AST_CALL:{ + if(ast->c.func)free(ast->c.func); + for(int i=0;i<ast->c.nargs;i++)if(ast->c.args[i])ast_free(ast->c.args[i]); + free(ast->c.args); break; } - case AST_IF:{ ASTif *ast=(ASTif*)ast_; - if(ast->cond)free(ast->cond); - if(ast->thenb)free(ast->thenb); - if(ast->elseb)free(ast->elseb); + case AST_IF:{ + if(ast->i.cond)free(ast->i.cond); + if(ast->i.thenb)free(ast->i.thenb); + if(ast->i.elseb)free(ast->i.elseb); break; } - case AST_WHILE:{ ASTwhile *ast=(ASTwhile*)ast_; - if(ast->cond)free(ast->cond); - if(ast->body)free(ast->body); + case AST_WHILE:{ + if(ast->w.cond)free(ast->w.cond); + if(ast->w.body)free(ast->w.body); break; } } - free(ast_); + free(ast); } |