diff options
Diffstat (limited to 'parser.cpp')
-rw-r--r-- | parser.cpp | 339 |
1 files changed, 320 insertions, 19 deletions
@@ -1,5 +1,6 @@ #include <stdexcept> #include <stack> +#include <unordered_map> #include <cstring> #include <cctype> #include <cassert> @@ -8,10 +9,27 @@ using namespace std; +#define DEBUG cerr<<'['<<__FILE__<<':'<<__LINE__<<"] " + +template <typename T> +static ostream& operator<<(ostream &os,const vector<T> &v){ + os<<'{'; + bool first=true; + for(const T &t : v){ + if(!first)os<<", "; + else first=false; + os<<t; + } + return os<<'}'; +} + + ParseError::ParseError(const string &what_arg) :runtime_error(what_arg){} ParseError::ParseError(const char *what_arg) :runtime_error(what_arg){} +ParseError::ParseError(Site site,const string &what_arg) + :runtime_error(site.filename+":"+to_string(site.lnum)+":"+to_string(site.linex)+": "+what_arg){} static bool isinitwordchar(char c){ @@ -27,7 +45,7 @@ static const vector<string> tok_symbols={ ":=", "=", "+", "-", "*", "/", "%", "(", ")", ",", - "{", "}", "?{", "??{", "}&", + "{", "}", "?", "??", }; static bool isSymbolPrefix(const string &s){ @@ -108,6 +126,10 @@ public: return *this; } + Site site() const { + return Site(filename,lnum,linex); + } + /*void save(){ statestack.push({idx,nextidx,lnum,lineidx,ttype}); } @@ -131,7 +153,7 @@ public: } Token get(){ - return Token(ttype,get_(),Site(filename,lnum,linex)); + return Token(ttype,get_(),site()); } // Returns whether there are more tokens @@ -144,7 +166,7 @@ public: lnum++; linex=1; } else { - linex++; + linex+=1+3*(source[idx]=='\t'); } idx++; } @@ -158,7 +180,7 @@ public: ttype=Token::Type::terminator; return true; } - linex++; + linex+=1+3*(source[idx]=='\t'); idx++; } if(eof())return false; @@ -200,7 +222,7 @@ public: if(source[nextidx]=='.'){ nextidx++; if(eof(nextidx)||!isdigit(source[nextidx])){ - throw ParseError("Incomplete floating point literal at EOF"); + throw ParseError(site(),"Incomplete floating point literal at EOF"); } while(!eof(nextidx)&&isdigit(source[nextidx]))nextidx++; if(eof(nextidx))return true; @@ -208,11 +230,11 @@ public: if(strchr("eE",source[nextidx])!=NULL){ nextidx++; if(eof(nextidx)||strchr("+-0123456789",source[nextidx])==NULL){ - throw ParseError("Incomplete floating point literal at EOF"); + throw ParseError(site(),"Incomplete floating point literal at EOF"); } if(strchr("+-",source[nextidx])!=NULL){ nextidx++; - if(eof(nextidx))throw ParseError("Incomplete floating point literal at EOF"); + if(eof(nextidx))throw ParseError(site(),"Incomplete floating point literal at EOF"); } while(!eof(nextidx)&&isdigit(source[nextidx]))nextidx++; } @@ -225,8 +247,9 @@ public: nextidx++; while(!eof(nextidx)&&source[nextidx]!='"'){ if(source[nextidx]=='\\')nextidx++; + nextidx++; } - if(eof(nextidx))throw ParseError("Incomplete string literal at EOF"); + if(eof(nextidx))throw ParseError(site(),"Incomplete string literal at EOF"); nextidx++; return true; } @@ -236,23 +259,266 @@ public: ttype=Token::Type::symbol; nextidx++; while(!eof(nextidx)){ - if(!isSymbolPrefix(get_()))return true; + if(!isSymbolPrefix(get_())){ + nextidx--; + return true; + } nextidx++; } + nextidx--; if(contains(tok_symbols,get_()))return true; - else throw ParseError("Unknown symbol at EOF"); + else throw ParseError(site(),"Unknown symbol at EOF"); } - throw ParseError("Unknown token starting at '"+source.substr(idx,5)+"'"); + throw ParseError(site(),"Unknown token starting at '"+source.substr(idx,5)+"'"); } }; -static Expression parseExpression(Tokeniser &tokeniser){ - ; +enum class Associativity{ + left, + right, +}; + +struct OpInfo{ + string name; + int prec; //higher is tighter-binding + Associativity assoc; +}; + +unordered_map<string,OpInfo> optable={ + {"*", {"*", 6,Associativity::left}}, + {"/", {"/", 6,Associativity::left}}, + {"%", {"%", 6,Associativity::left}}, + + {"+", {"+", 5,Associativity::left}}, + {"-", {"-", 5,Associativity::left}}, + + {"==",{"==",3,Associativity::left}}, + {"!=",{"!=",3,Associativity::left}}, + {">", {">", 3,Associativity::left}}, + {"<", {"<", 3,Associativity::left}}, + {">=",{">=",3,Associativity::left}}, + {"<=",{"<=",3,Associativity::left}}, +}; + +static char unhexchar(char c){ + if(c>='0'&&c<='9')return c-'0'; + if(c>='a'&&c<='f')return c-'a'+10; + if(c>='A'&&c<='F')return c-'A'+10; + return (char)-1; +} + +static string parseString(const string &repr,Site site){ + if(repr.size()<2||repr[0]!='"'||repr.back()!='"')throw runtime_error("String not surrounded with quotes"); + string res; + res.reserve(repr.size()+3); + for(i64 i=1;i<(i64)repr.size()-1;i++){ + if(repr[i]=='\\'){ + switch(repr[i+1]){ + case 'n': res+='\n'; i++; break; + case 'r': res+='\r'; i++; break; + case 't': res+='\t'; i++; break; + case '"': res+='"'; i++; break; + case 'x':{ + if(i+3>=(i64)repr.size()-1)throw ParseError(site.addX(i),"Invalid hexadecimal escape"); + char c1=unhexchar(repr[i+2]); + char c2=unhexchar(repr[i+3]); + if(c1==(char)-1||c2==(char)-1)throw ParseError(site.addX(i),"Invalid hexadecimal escape"); + res+=(char)(16*c1+c2); + i+=3; + break; + } + default: + throw ParseError(site.addX(i),"Invalid hexadecimal escape"); + } + } else { + res+=repr[i]; + } + } + return res; +} + +static Expression parseExpression(Tokeniser &tokeniser,int minprec=-1); +static StatementList parseScope(Tokeniser &tokeniser); + +static vector<Expression> parseArgumentList(Tokeniser &tokeniser){ + if(tokeniser.eof())throw ParseError(tokeniser.site(),"Expected argument list but found EOF"); + Token tok=tokeniser.get(); + if(tok.type!=Token::Type::symbol||tok.str!="("){ + throw ParseError(tok.site,"Expected argument list but found '"+tok.str+"'"); + } + tokeniser.advance(); + vector<Expression> args; + while(true){ + Expression expr=parseExpression(tokeniser); + if(tokeniser.eof()){ + throw ParseError(tokeniser.site(),"Expected ')' or ',' after argument but found EOF"); + } + tok=tokeniser.get(); + if(tok.type!=Token::Type::symbol||(tok.str!=")"&&tok.str!=",")){ + throw ParseError(tok.site,"Expected ')' or ',' after argument but found something else"); + } + tokeniser.advance(); + args.push_back(expr); + if(tok.str==")")break; + } + return args; +} + +static Expression parseAtom(Tokeniser &tokeniser){ + if(tokeniser.eof())throw ParseError(tokeniser.site(),"Expected atom but found EOF"); + Token tok=tokeniser.get(); + switch(tok.type){ + case Token::Type::word:{ + tokeniser.advance(); + if(tokeniser.eof()){ + if(tok.str=="if")throw ParseError(tok.site,"Expected expressions after 'if' but found EOF"); + Expression expr=Expression(Expression::Type::call,tok.str,vector<Expression>()); + expr.site=tok.site; + return expr; + } + if(tok.str=="if"){ + Expression cond=parseExpression(tokeniser); + if(tokeniser.eof())throw ParseError(tokeniser.site(),"Expected 'then' but found EOF"); + Token tok2=tokeniser.get(); + if(tok2.type!=Token::Type::word||tok2.str!="then"){ + throw ParseError(tok2.site,"Expected 'then' but got '"+tok2.str+"'"); + } + Expression ex1=parseExpression(tokeniser); + if(tokeniser.eof())throw ParseError(tokeniser.site(),"Expected 'else' but found EOF"); + tok2=tokeniser.get(); + if(tok2.type!=Token::Type::word||tok2.str!="else"){ + throw ParseError(tok2.site,"Expected 'else' but got '"+tok2.str+"'"); + } + Expression ex2=parseExpression(tokeniser); + + return Expression(Expression::Type::cond,{cond,ex1,ex2}); + } + + Token tok2=tokeniser.get(); + if(tok2.type==Token::Type::symbol&&tok2.str=="("){ + vector<Expression> args=parseArgumentList(tokeniser); + bool done=false; + if(tokeniser.eof())done=true; + else { + tok2=tokeniser.get(); + if(tok2.type!=Token::Type::symbol||tok2.str!="{")done=true; + } + if(done){ + Expression expr=Expression(Expression::Type::call,tok.str,args); + expr.site=tok.site; + return expr; + } + return Expression(Expression::Type::dive,tok.str,args, + Scope(Scope::Type::direct,parseScope(tokeniser),{})); + } else if(tok2.type==Token::Type::symbol&&tok2.str=="{"){ + return Expression(Expression::Type::dive,tok.str,{}, + Scope(Scope::Type::direct,parseScope(tokeniser),{})); + } else { + Expression expr=Expression(Expression::Type::call,tok.str,vector<Expression>()); + expr.site=tok.site; + return expr; + } + } + + case Token::Type::number:{ + tokeniser.advance(); + Expression expr=Expression(Expression::Type::number,strtod(tok.str.data(),nullptr)); + expr.site=tok.site; + return expr; + } + + case Token::Type::string:{ + tokeniser.advance(); + Expression expr=Expression(Expression::Type::string,parseString(tok.str,tok.site)); + expr.site=tok.site; + return expr; + } + + case Token::Type::symbol:{ + if(tok.str=="("){ + tokeniser.advance(); + Expression expr=parseExpression(tokeniser); + if(tokeniser.eof())throw ParseError(tokeniser.site(),"Expected ')' but found EOF"); + Token tok2=tokeniser.get(); + if(tok2.type!=Token::Type::symbol||tok2.str!=")"){ + throw ParseError(tok2.site,"Expected ')' but found something else"); + } + tokeniser.advance(); + return expr; + } + Scope::Type sctype; + if(tok.str=="?")sctype=Scope::Type::lazy; + else if(tok.str=="??")sctype=Scope::Type::function; + else if(tok.str!="{"){ + throw ParseError(tok.site,"Unexpected token '"+tok.str+"' in expression atom position"); + } else sctype=Scope::Type::direct; + vector<Expression> args; + if(sctype!=Scope::Type::direct){ + tokeniser.advance(); + if(tokeniser.eof()){ + throw ParseError(tokeniser.site(),"Expected scope after '"+tok.str+"' but found EOF"); + } + Token tok2=tokeniser.get(); + if(tok2.type!=Token::Type::symbol){ + throw ParseError(tok2.site,"Expected '(' or '{' after '"+tok.str+"'"); + } + if(tok2.type==Token::Type::symbol&&tok2.str=="("){ + args=parseArgumentList(tokeniser); + } + } + // DEBUG<<"args: "<<args<<endl; + // DEBUG<<"get(): "<<tokeniser.get().str<<endl; + if(tokeniser.eof())throw ParseError(tokeniser.site(),"Expected '{' to begin scope"); + Token tok2=tokeniser.get(); + if(tok2.type!=Token::Type::symbol||tok2.str!="{"){ + throw ParseError(tok2.site,"Expected '{' to begin scope"); + } + Scope sc(sctype,parseScope(tokeniser),args); + return Expression(Expression::Type::scope,sc); + } + + case Token::Type::terminator: + throw ParseError(tok.site,"Expected expression atom but found statement terminator (newline or ';')"); + } +} + +static Expression parseExpression(Tokeniser &tokeniser,int minprec){ + Expression result=parseAtom(tokeniser); + while(!tokeniser.eof()){ + Token tok=tokeniser.get(); + if(tok.type==Token::Type::terminator|| + (tok.type==Token::Type::symbol&&(tok.str==","||tok.str==")"))|| + (tok.type==Token::Type::word&&(tok.str=="then"||tok.str=="else")))break; + if(tok.type!=Token::Type::symbol){ + throw ParseError(tok.site,"Expected operator in expression"); + } + auto it=optable.find(tok.str); + if(it==optable.end()){ + throw ParseError(tok.site,"Undefined operator '"+tok.str+"'"); + } + const OpInfo &op=it->second; + + if(op.prec<minprec)break; + tokeniser.advance(); + i64 nextminprec; + switch(op.assoc){ + case Associativity::left: nextminprec=op.prec+1; break; + case Associativity::right: nextminprec=op.prec; break; + } + Expression rhs=parseExpression(tokeniser,nextminprec); + Site oldsite=result.site; + result=Expression(Expression::Type::binop,op.name,{result,rhs}); + result.site=oldsite; + } + return result; } static Statement parseStatement(Tokeniser &tokeniser){ + if(tokeniser.eof()){ + throw ParseError(tokeniser.site(),"Expected statement but found EOF"); + } Token tok=tokeniser.get(); switch(tok.type){ case Token::Type::word:{ @@ -262,32 +528,67 @@ static Statement parseStatement(Tokeniser &tokeniser){ if(tok2.type==Token::Type::symbol&&tok2.str==":="){ tokeniser=copyiser; tokeniser.advance(); - return Statement(Statement::Type::create,tok.str,parseExpression(tokeniser)); + Statement st=Statement(Statement::Type::create,tok.str,parseExpression(tokeniser)); + st.site=tok.site; + return st; } else if(tok2.type==Token::Type::symbol&&tok2.str=="="){ tokeniser=copyiser; tokeniser.advance(); - return Statement(Statement::Type::assign,tok.str,parseExpression(tokeniser)); + Statement st=Statement(Statement::Type::assign,tok.str,parseExpression(tokeniser)); + st.site=tok.site; + return st; } else { - return Statement(Statement::Type::expression,parseExpression(tokeniser)); + Statement st=Statement(Statement::Type::expression,parseExpression(tokeniser)); + st.site=tok.site; + return st; } } case Token::Type::number: case Token::Type::string: - case Token::Type::symbol: - return Statement(Statement::Type::expression,parseExpression(tokeniser)); + case Token::Type::symbol:{ + Statement st=Statement(Statement::Type::expression,parseExpression(tokeniser)); + st.site=tok.site; + return st; + } case Token::Type::terminator: throw runtime_error("Unexpected terminator in parseStatement()"); } } +static StatementList parseScope(Tokeniser &tokeniser){ + if(tokeniser.eof())throw ParseError(tokeniser.site(),"Expected scope"); + Token tok=tokeniser.get(); + if(tok.type!=Token::Type::symbol||tok.str!="{"){ + throw ParseError(tok.site,"Expected scope but found '"+tok.str+"'"); + } + if(!tokeniser.advance())throw ParseError(tok.site,"Incomplete scope at EOF"); + StatementList stl; + while(true){ + if(tokeniser.eof())throw ParseError(tokeniser.site(),"Incomplete scope at EOF"); + tok=tokeniser.get(); + if(tok.type==Token::Type::terminator){ + tokeniser.advance(); + continue; + } + if(tok.type==Token::Type::symbol&&tok.str=="}")break; + stl.push_back(parseStatement(tokeniser)); + } + tokeniser.advance(); + // DEBUG<<"leaving parseScope with tokeniser at "<<tokeniser.site()<<endl; + return stl; +} + StatementList parse(const string &source,const string &filename){ Tokeniser tokeniser(source,filename); if(!tokeniser.advance())return {}; StatementList stl; while(!tokeniser.eof()){ - if(tokeniser.get().type==Token::Type::terminator)continue; + if(tokeniser.get().type==Token::Type::terminator){ + tokeniser.advance(); + continue; + } stl.push_back(parseStatement(tokeniser)); } return stl; |