summaryrefslogtreecommitdiff
path: root/parser.c
diff options
context:
space:
mode:
authortomsmeding <tom.smeding@gmail.com>2016-08-05 20:26:05 +0200
committertomsmeding <tom.smeding@gmail.com>2016-08-06 10:10:32 +0200
commitf67988fbfde6ad8a91466ef5d4227dcf9e5db6ce (patch)
treedb85d3936f717331c3eeed4fae43e5ed43324be9 /parser.c
parente6bb770a52980ef3d85c2d4b93fb240c026ce7f7 (diff)
Working preliminary version of parser
Diffstat (limited to 'parser.c')
-rw-r--r--parser.c538
1 files changed, 430 insertions, 108 deletions
diff --git a/parser.c b/parser.c
index 96b9bf7..14fd47d 100644
--- a/parser.c
+++ b/parser.c
@@ -2,193 +2,515 @@
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
+#include <assert.h>
#include "memory.h"
+#include "opfuncs.h"
#include "parser.h"
+#define NOT_IMPLEMENTED false
+
+
+static bool ishexdigit(char c){
+ return (c>='0'&&c<='9')||(c>='a'&&c<='f')||(c>='A'&&c<='F');
+}
+
+static int hexnumber(char c){
+ return c<='9'?c-'0':(c&~('a'-'A'))-'A'+10;
+}
+
+static char hexencode(int n){
+ return n<10?n+'0':n+'a';
+}
+
+
typedef enum Tokentype{
TT_NUM,
TT_STR,
TT_WORD,
- TT_SYM
+ TT_OP,
+ TT_SYM, //all symbols that are not operators
+ TT_ENDSTMT,
+ TT_EOF,
+
+ TT_ERR=-1
} Tokentype;
typedef struct Token{
- const char *str;
+ Tokentype type;
+ const char *str; //Part of another string; not null-terminated, and do not free
int len;
} Token;
-Token nexttoken(const char **sourcep){
+
+static bool parsecomment(const char **sourcep){
+ const char *source=*sourcep;
+ if(*source!='#')return false;
+ if(source[1]=='#'&&source[2]=='#'){
+ source+=3;
+ while(*source&&
+ (*source!='#'||source[1]!='#'||source[2]!='#')){
+ source++;
+ }
+ if(!*source)return false; //unclosed block comment
+ source+=2;
+ } else {
+ while(*source&&*source!='\n')source++;
+ if(*source)source++;
+ }
+ *sourcep=source;
+ return true;
+}
+
+static void skipintermediate(const char **sourcep){
+ const char *source=*sourcep;
+ bool acted;
+ do {
+ acted=false;
+ while(isspace(*source)){
+ source++;
+ acted=true;
+ }
+ if(parsecomment(&source)){
+ acted=true;
+ }
+ } while(acted);
+ *sourcep=source;
+}
+
+static Token nexttoken(const char **sourcep){
+ skipintermediate(sourcep);
const char *source=*sourcep;
- while(isspace(*source))source++;
+ if(*source=='\0'){
+ Token tok={TT_EOF,NULL,-1};
+ return tok;
+ }
+ if(*source==';'){
+ Token tok={TT_ENDSTMT,source,1};
+ (*sourcep)++;
+ return tok;
+ }
if(isdigit(*source)||(*source=='-'&&isdigit(source[1]))){
char *endp;
strtod(source,&endp);
assert(endp!=source);
- Token tok={source,endp-source};
+ Token tok={TT_NUM,source,endp-source};
+ *sourcep=endp;
return tok;
}
-}
-
-
-int precedence(const char *op){
- switch(op[0]){
- case '!': return op[1]=='\0' ? 12 : op[1]=='='&&op[2]=='\0' ? 5 : -1;
- case '%': return op[1]=='\0' ? 11 : -1;
- case '&': return op[1]=='\0' ? 9 : op[1]=='&'&&op[2]=='\0' ? 4 : -1;
- case '*': return op[1]=='\0' ? 11 : op[1]=='*'&&op[2]=='\0' ? 14 : -1;
- case '+': return op[1]=='\0' ? 10 : -1;
- case '-': return op[1]=='\0' ? 10 : -1;
- case '/': return op[1]=='\0' ? 11 : op[1]=='/'&&op[2]=='\0' ? 11 : -1;
- case '<': return op[1]=='\0' ? 6 : op[1]=='='&&op[2]=='\0' ? 6 : -1;
- case '=': return op[1]=='\0' ? 1 : op[1]=='='&&op[2]=='\0' ? 5 : -1;
- case '>': return op[1]=='\0' ? 6 : op[1]=='='&&op[2]=='\0' ? 6 : -1;
- case '^': return op[1]=='\0' ? 8 : op[1]=='^'&&op[2]=='\0' ? 3 : -1;
- case '|': return op[1]=='\0' ? 7 : op[1]=='|'&&op[2]=='\0' ? 2 : -1;
- case '~': return op[1]=='\0' ? 12 : -1;
- default: return -1;
+ if(*source=='"'){
+ int i;
+ for(i=1;source[i]&&source[i]!='"';i++){
+ if(source[i]=='\\')i++;
+ }
+ if(!source[i]){
+ Token tok={TT_ERR,"Non-terminated string",21};
+ return tok;
+ }
+ *sourcep+=i+1;
+ Token tok={TT_STR,source,i+1};
+ return tok;
}
-}
-
-int associativity(const char *op){
- switch(op[0]){
- case '!': return op[1]=='\0' ? AS_PREFIX : op[1]=='='&&op[2]=='\0' ? AS_NONASSOC : -1;
- case '%': return op[1]=='\0' ? AS_LEFT : -1;
- case '&': return op[1]=='\0' ? AS_LEFT : op[1]=='&'&&op[2]=='\0' ? AS_LEFT : -1;
- case '*': return op[1]=='\0' ? AS_LEFT : op[1]=='*'&&op[2]=='\0' ? AS_RIGHT : -1;
- case '+': return op[1]=='\0' ? AS_LEFT : -1;
- case '-': return op[1]=='\0' ? AS_LEFT : -1;
- case '/': return op[1]=='\0' ? AS_LEFT : op[1]=='/'&&op[2]=='\0' ? AS_LEFT : -1;
- case '<': return op[1]=='\0' ? AS_NONASSOC : op[1]=='='&&op[2]=='\0' ? AS_NONASSOC : -1;
- case '=': return op[1]=='\0' ? AS_RIGHT : op[1]=='='&&op[2]=='\0' ? AS_NONASSOC : -1;
- case '>': return op[1]=='\0' ? AS_NONASSOC : op[1]=='='&&op[2]=='\0' ? AS_NONASSOC : -1;
- case '^': return op[1]=='\0' ? AS_LEFT : op[1]=='^'&&op[2]=='\0' ? AS_LEFT : -1;
- case '|': return op[1]=='\0' ? AS_LEFT : op[1]=='|'&&op[2]=='\0' ? AS_LEFT : -1;
- case '~': return op[1]=='\0' ? AS_PREFIX : -1;
- default: return -1;
+ int oplen=parseoplength(source);
+ if(oplen!=-1){
+ Token tok={TT_OP,source,oplen};
+ *sourcep+=oplen;
+ return tok;
+ }
+ if(strchr("(){}",*source)!=NULL){
+ Token tok={TT_SYM,source,1};
+ (*sourcep)++;
+ return tok;
+ }
+ if(isalpha(*source)||*source=='_'){
+ int i;
+ for(i=1;source[i];i++){
+ if(!isalpha(source[i])&&!isdigit(source[i])&&source[i]!='_')break;
+ }
+ Token tok={TT_WORD,source,i};
+ *sourcep+=i;
+ return tok;
}
+ Token tok={TT_ERR,"Unrecognised token",18};
+ return tok;
}
-static bool parsecomment(const char *source,int *reslen){
- int cursor=0;
- if(source[cursor]!='#')return false;
- if(source[cursor+1]=='#'&&source[cursor+2]=='#'){
- cursor+=3;
- while(source[cursor]&&
- (source[cursor]!='#'||source[cursor+1]!='#'||source[cursor+2]!='#')){
- cursor++;
- }
- if(!source[cursor])return false; //unclosed block comment
- cursor+=2;
+static void printtoken(FILE *stream,Token tok,const char *msg){
+ const char *type;
+ switch(tok.type){
+ case TT_NUM: type="TT_NUM"; break;
+ case TT_STR: type="TT_STR"; break;
+ case TT_WORD: type="TT_WORD"; break;
+ case TT_OP: type="TT_OP"; break;
+ case TT_SYM: type="TT_SYM"; break;
+ case TT_ENDSTMT: type="TT_ENDSTMT"; break;
+ case TT_EOF: type="TT_EOF"; break;
+ case TT_ERR: type="TT_ERR"; break;
+ default: type="TT_(??\?)"; break; //TRIGRAPHS ._.
+ }
+ if(tok.len!=-1){
+ char buf[tok.len+1];
+ memcpy(buf,tok.str,tok.len);
+ buf[tok.len]='\0';
+ fprintf(stream,"(%s) Token: %s '%s'\n",msg,type,buf);
} else {
- while(source[cursor]&&source[cursor]!='\n')cursor++;
- if(source[cursor])cursor++;
+ fprintf(stream,"(%s) Token: %s (null)\n",msg,type);
}
- *reslen=cursor;
- return true;
}
-static void parseintermediate(const char *source,int *reslen){
- int cursor=0;
- bool acted;
- do {
- acted=false;
- while(source[cursor]&&isspace(source[cursor])){
- cursor++;
- acted=true;
+
+static AST* parseterm(const char *source,int *reslen){
+ const char *origsource=source;
+ const Token tok=nexttoken(&source);
+ printtoken(stderr,tok,"parseterm");
+ AST *node;
+ switch(tok.type){
+ case TT_NUM:{
+ node=malloc(sizeof(AST));
+ if(!node)outofmem();
+ node->type=AST_NUM;
+ char *endp;
+ int intv=strtol(tok.str,&endp,0);
+ node->n.isint=endp-tok.str==tok.len;
+ if(node->n.isint)node->n.i=intv;
+ else node->n.d=strtod(tok.str,NULL);
+ break;
}
- int partlen;
- if(parsecomment(source+cursor,&partlen)){
- cursor+=partlen;
- acted=true;
+
+ case TT_STR:{
+ int slen=0;
+ for(int i=1;i<tok.len-1;i++){
+ slen++;
+ if(tok.str[i]!='\\')continue;
+ i++;
+ if(tok.str[i]=='x'){
+ if(i+2>=tok.len-1||!ishexdigit(tok.str[i+1])||!ishexdigit(tok.str[i+2])){
+ return NULL;
+ }
+ i+=2;
+ } else {
+ i++;
+ }
+ }
+ node=malloc(sizeof(AST));
+ if(!node)outofmem();
+ node->type=AST_STR;
+ node->s.str=malloc(slen+1);
+ if(!node->s.str)outofmem();
+ int j=0;
+ for(int i=1;i<tok.len-1;i++){
+ if(tok.str[i]!='\\'){
+ node->s.str[j++]=tok.str[i];
+ continue;
+ }
+ i++;
+ switch(tok.str[i]){
+ case 'n': node->s.str[j++]='\n'; break;
+ case 'r': node->s.str[j++]='\r'; break;
+ case 't': node->s.str[j++]='\t'; break;
+ case 'b': node->s.str[j++]='\b'; break;
+ case 'a': node->s.str[j++]='\a'; break;
+ case 'x':
+ node->s.str[j++]=16*hexnumber(tok.str[i+1])+hexnumber(tok.str[i+2]);
+ i+=2;
+ break;
+ default:
+ node->s.str[j++]=tok.str[i];
+ break;
+ }
+ }
+ node->s.str[j]='\0';
+ break;
}
- } while(acted);
- *reslen=cursor;
+
+ case TT_WORD:{
+ if(tok.len==2&&memcmp(tok.str,"if",2)==0)assert(NOT_IMPLEMENTED);
+ if(tok.len==5&&memcmp(tok.str,"while",2)==0)assert(NOT_IMPLEMENTED);
+ const char *tempsource=source;
+ Token next=nexttoken(&source);
+ if(next.len==1&&next.str[0]=='(')assert(NOT_IMPLEMENTED);
+ source=tempsource;
+ node=malloc(sizeof(AST));
+ if(!node)outofmem();
+ node->type=AST_VAR;
+ node->v.name=malloc(tok.len+1);
+ if(!node->v.name)outofmem();
+ memcpy(node->v.name,tok.str,tok.len);
+ node->v.name[tok.len]='\0';
+ break;
+ }
+
+ case TT_SYM:
+ assert(NOT_IMPLEMENTED);
+ break;
+
+ case TT_OP:{
+ char buf[tok.len+3];
+ buf[0]='(';
+ memcpy(buf+1,tok.str,tok.len);
+ buf[tok.len+1]=')';
+ buf[tok.len+2]='\0';
+ if(associativity(buf)==AS_PREFIX){
+ node=malloc(sizeof(AST));
+ if(!node)outofmem();
+ node->type=AST_OP;
+ node->o.op=opconststring_len(buf,tok.len+2);
+ node->o.left=NULL;
+ int len;
+ node->o.right=parseterm(source,&len);
+ if(!node->o.right){
+ free(node);
+ return NULL;
+ }
+ source+=len;
+ } else return NULL;
+ break;
+ }
+
+ case TT_ENDSTMT:
+ case TT_EOF:
+ case TT_ERR:
+ return NULL;
+ }
+ *reslen=source-origsource;
+ return node;
}
+//Uses precedence climbing
static AST* parseexpr(const char *source,int *reslen,int minprec){
- ;
+ const char *origsource=source;
+ int len;
+ AST *tree=parseterm(source,&len);
+ if(!tree)return NULL;
+ source+=len;
+ while(true){
+ const char *beforeop=source;
+ Token tok=nexttoken(&source);
+ printtoken(stderr,tok,"parseEXPR");
+ if(tok.type==TT_ENDSTMT){
+ source=beforeop;
+ break;
+ }
+ if(tok.type!=TT_OP){
+ ast_free(tree);
+ return NULL;
+ }
+ int prec=precedence_len(tok.str,tok.len);
+ if(prec<minprec){
+ source=beforeop;
+ break;
+ }
+ Associativity assoc=associativity_len(tok.str,tok.len);
+ int q;
+ switch(assoc){
+ case AS_PREFIX: case AS_SUFFIX:
+ ast_free(tree);
+ return NULL;
+
+ case AS_LEFT: q=prec+1; break;
+ case AS_RIGHT: q=prec; break;
+ case AS_NONASSOC: q=prec+1; minprec=prec+1; break;
+
+ default: assert(false);
+ }
+ AST *right=parseexpr(source,&len,q);
+ if(!right){
+ ast_free(tree);
+ return NULL;
+ }
+ source+=len;
+ AST *opnode=malloc(sizeof(AST));
+ if(!opnode)outofmem();
+ opnode->type=AST_OP;
+ opnode->o.op=opconststring_len(tok.str,tok.len);
+ if(!opnode->o.op)outofmem();
+ opnode->o.left=tree;
+ opnode->o.right=right;
+ tree=opnode;
+ }
+ *reslen=source-origsource;
+ return tree;
}
static AST* parsestmt(const char *source,int *reslen){
return parseexpr(source,reslen,0);
}
-ASTblock* parse(const char *source){
- ASTblock *bl=malloc(sizeof(ASTblock));
+AST* parse(const char *source){
+ AST *bl=malloc(sizeof(AST));
+ if(!bl)outofmem();
+ bl->type=AST_BLOCK;
int sz=32;
- bl->len=0;
- bl->exprs=calloc(sz,sizeof(AST*));
- if(!bl->exprs)outofmem();
+ bl->b.len=0;
+ bl->b.exprs=calloc(sz,sizeof(AST*));
+ if(!bl->b.exprs)outofmem();
int reslen;
int cursor=0;
while(true){
- if(bl->len==sz){
+ if(bl->b.len==sz){
sz*=2;
- bl->exprs=realloc(bl->exprs,sz*sizeof(AST*));
- if(!bl->exprs)outofmem();
+ bl->b.exprs=realloc(bl->b.exprs,sz*sizeof(AST*));
+ if(!bl->b.exprs)outofmem();
}
- parseintermediate(source+cursor,&reslen);
- if(!source[cursor])break;
AST *node=parsestmt(source+cursor,&reslen);
if(!node){
- ast_free((AST*)bl);
+ ast_free(bl);
return NULL;
}
- bl->exprs[bl->len++]=node;
+ bl->b.exprs[bl->b.len++]=node;
cursor+=reslen;
+ const char *src=source+cursor;
+ Token tok=nexttoken(&src);
+ if(tok.type!=TT_ENDSTMT){
+ ast_free(bl);
+ return NULL;
+ }
+ cursor=src-source;
+ src=source+cursor;
+ tok=nexttoken(&src);
+ if(tok.type==TT_EOF)break;
}
return bl;
}
-void ast_free(AST *ast_){
- switch(ast_->type){
- case AST_BLOCK:{ ASTblock *ast=(ASTblock*)ast_;
- for(int i=0;i<ast->len;i++)if(ast->exprs[i])ast_free(ast->exprs[i]);
- free(ast->exprs);
+static const char* charblock(char c,int n){
+ static char *buf=NULL;
+ if(!buf)buf=malloc(n+1);
+ else buf=realloc(buf,n+1);
+ if(!buf)outofmem();
+ memset(buf,c,n);
+ buf[n]='\0';
+ return buf;
+}
+
+#define TABW (4)
+#define INDENT fprintf(stream,"%s",charblock(' ',TABW*indent));
+static void ast_debug_(FILE *stream,const AST *ast,int indent){
+ switch(ast->type){
+ case AST_BLOCK:
+ if(ast->b.len==0){
+ fprintf(stream,"{}");
+ break;
+ }
+ fprintf(stream,"{\n");
+ indent++;
+ for(int i=0;i<ast->b.len;i++){
+ INDENT
+ ast_debug_(stream,ast->b.exprs[i],indent);
+ fputc('\n',stream);
+ }
+ indent--;
+ INDENT
+ fprintf(stream,"}");
+ break;
+
+ case AST_OP:{
+ bool leftp=ast->o.left&&ast->o.left->type==AST_OP&&precedence(ast->o.left->o.op)<=precedence(ast->o.op);
+ bool rightp=ast->o.right&&ast->o.right->type==AST_OP&&precedence(ast->o.right->o.op)<=precedence(ast->o.op);
+ //fprintf(stderr,"[[op='%s' p=%d lp=%d rp=%d]]",ast->o.op,precedence(ast->o.op),leftp,rightp);
+ if(leftp)fputc('(',stream);
+ if(ast->o.left)ast_debug_(stream,ast->o.left,indent);
+ fprintf(stream,"%s%s%s",leftp?")":"",ast->o.op,rightp?"(":"");
+ if(ast->o.right)ast_debug_(stream,ast->o.right,indent);
+ if(rightp)fputc(')',stream);
+ break;
+ }
+
+ case AST_NUM:
+ if(ast->n.isint)fprintf(stream,"%lld",ast->n.i);
+ else fprintf(stream,"%g",ast->n.d);
+ break;
+
+ case AST_STR:
+ fputc('"',stream);
+ for(int i=0;i<ast->s.len;i++){
+ if(ast->s.str[i]<32||ast->s.str[i]>126){
+ fprintf(stream,"\\x%c%c",hexencode(ast->s.str[i]/16),hexencode(ast->s.str[i]%16));
+ } else fputc(ast->s.str[i],stream);
+ }
+ fputc('"',stream);
+ break;
+
+ case AST_VAR:
+ fprintf(stream,"%s",ast->v.name);
+ break;
+
+ case AST_CALL:
+ fprintf(stream,"%s(",ast->c.func);
+ for(int i=0;i<ast->c.nargs;i++){
+ if(i!=0)fputc(',',stream);
+ ast_debug_(stream,ast->c.args[i],indent);
+ }
+ fputc(')',stream);
+ break;
+
+ case AST_IF:
+ assert(NOT_IMPLEMENTED);
+ break;
+
+ case AST_WHILE:
+ assert(NOT_IMPLEMENTED);
+ break;
+
+ default:
+ fprintf(stream,"AST_(??\?)");
+ break;
+ }
+}
+
+void ast_debug(FILE *stream,const AST *ast){
+ ast_debug_(stream,ast,0);
+ fputc('\n',stream);
+}
+
+void ast_free(AST *ast){
+ switch(ast->type){
+ case AST_BLOCK:{
+ for(int i=0;i<ast->b.len;i++)if(ast->b.exprs[i])ast_free(ast->b.exprs[i]);
+ free(ast->b.exprs);
break;
}
- case AST_OP:{ ASTop *ast=(ASTop*)ast_;
- if(ast->left)ast_free(ast->left);
- if(ast->right)ast_free(ast->right);
+ case AST_OP:{
+ if(ast->o.left)ast_free(ast->o.left);
+ if(ast->o.right)ast_free(ast->o.right);
break;
}
case AST_NUM:
break;
- case AST_STR:{ ASTstr *ast=(ASTstr*)ast_;
- if(ast->str)free(ast->str);
+ case AST_STR:{
+ if(ast->s.str)free(ast->s.str);
break;
}
- case AST_VAR:{ ASTvar *ast=(ASTvar*)ast_;
- if(ast->name)free(ast->name);
+ case AST_VAR:{
+ if(ast->v.name)free(ast->v.name);
break;
}
- case AST_CALL:{ ASTcall *ast=(ASTcall*)ast_;
- if(ast->func)free(ast->func);
- for(int i=0;i<ast->nargs;i++)if(ast->args[i])ast_free(ast->args[i]);
- free(ast->args);
+ case AST_CALL:{
+ if(ast->c.func)free(ast->c.func);
+ for(int i=0;i<ast->c.nargs;i++)if(ast->c.args[i])ast_free(ast->c.args[i]);
+ free(ast->c.args);
break;
}
- case AST_IF:{ ASTif *ast=(ASTif*)ast_;
- if(ast->cond)free(ast->cond);
- if(ast->thenb)free(ast->thenb);
- if(ast->elseb)free(ast->elseb);
+ case AST_IF:{
+ if(ast->i.cond)free(ast->i.cond);
+ if(ast->i.thenb)free(ast->i.thenb);
+ if(ast->i.elseb)free(ast->i.elseb);
break;
}
- case AST_WHILE:{ ASTwhile *ast=(ASTwhile*)ast_;
- if(ast->cond)free(ast->cond);
- if(ast->body)free(ast->body);
+ case AST_WHILE:{
+ if(ast->w.cond)free(ast->w.cond);
+ if(ast->w.body)free(ast->w.body);
break;
}
}
- free(ast_);
+ free(ast);
}