diff options
Diffstat (limited to 'parser.cpp')
-rw-r--r-- | parser.cpp | 211 |
1 files changed, 211 insertions, 0 deletions
diff --git a/parser.cpp b/parser.cpp new file mode 100644 index 0000000..73ae368 --- /dev/null +++ b/parser.cpp @@ -0,0 +1,211 @@ +#include <stdexcept> +#include <cstring> +#include <cctype> +#include <cassert> +#include "parser.h" + +using namespace std; + + +ParseError::ParseError(const string &what_arg) + :runtime_error(what_arg){} +ParseError::ParseError(const char *what_arg) + :runtime_error(what_arg){} + + +static bool isinitwordchar(char c){ + return isalpha(c)||c=='_'; +} + +static bool iswordchar(char c){ + return isalpha(c)||isdigit(c)||c=='_'; +} + +static const vector<string> tok_symbols={ + "==", "!=", ">", "<", ">=", "<=", + ":=", "=", + "+", "-", "*", "/", "%", + "(", ")", ",", + "{", "}", "?{", "??{", "}&", +}; + +static bool isSymbolPrefix(const string &s){ + for(const string &sym : tok_symbols){ + if(s.size()<=sym.size()&&sym.substr(0,s.size())==s)return true; + } + return false; +} + +template <typename T> +static bool contains(const vector<T> &v,const T &target){ + for(const T &t : v){ + if(t==target)return true; + } + return false; +} + + +class Token{ +public: + enum class Type{ + word, + number, + string, + symbol, + }; + + Type type; + string str; + Site site; + + Token(Type type,const string &str,const Site &site) + :type(type),str(str),site(site){} +}; + + +class Tokeniser{ + const string &source; + const string &filename; + i64 idx,nextidx; + i64 lnum,linex; + Token::Type ttype; + + bool eof(i64 at){ + return at>=(i64)source.size(); + } + + string get_() const { + if(eof())throw runtime_error("Tokeniser::get() on eof"); + if(nextidx==-1)throw runtime_error("Tokeniser::get() before advance"); + if(nextidx==-2)throw runtime_error("Tokeniser::get() after eof"); + assert(nextidx>=0); + return source.substr(idx,nextidx-idx); + } + +public: + Tokeniser(const string &source,const string &filename) + :source(source),filename(filename), + idx(0),nextidx(-1), + lnum(1),linex(1){} + + bool eof() const { + return idx>=(i64)source.size(); + } + + Token get() const { + return Token(ttype,get_(),Site(filename,lnum,linex)); + } + + // Returns whether there are more tokens + bool advance(){ + if(eof())return false; + + while(idx<nextidx){ + if(source[idx]=='\n'){ + lnum++; + linex=1; + } else { + linex++; + } + idx++; + } + + while(true){ + i64 origidx=idx; + while(!eof()&&isspace(source[idx])){ + if(source[idx]=='\n'){ + lnum++; + linex=1; + } else { + linex++; + } + idx++; + } + if(eof())return false; + + if(source[idx]=='#'){ + while(!eof()&&source[idx]!='\n')idx++; + idx++; + lnum++; + linex=1; + if(eof())return false; + } + + if(idx==origidx)break; + } + + nextidx=idx; + + // Word + if(isinitwordchar(source[nextidx])){ + ttype=Token::Type::word; + do nextidx++; + while(!eof(nextidx)&&iswordchar(source[nextidx])); + return true; + } + + // Number literal + if(isdigit(source[nextidx])||(!eof(nextidx+1)&&source[nextidx]=='-'&&isdigit(source[nextidx+1]))){ + ttype=Token::Type::number; + if(source[nextidx]=='-')nextidx++; + while(!eof(nextidx)&&isdigit(source[nextidx]))nextidx++; + if(eof(nextidx))return true; + if(source[nextidx]=='.'){ + nextidx++; + if(eof(nextidx)||!isdigit(source[nextidx])){ + throw ParseError("Incomplete floating point literal at EOF"); + } + while(!eof(nextidx)&&isdigit(source[nextidx]))nextidx++; + if(eof(nextidx))return true; + } + if(strchr("eE",source[nextidx])!=NULL){ + nextidx++; + if(eof(nextidx)||strchr("+-0123456789",source[nextidx])==NULL){ + throw ParseError("Incomplete floating point literal at EOF"); + } + if(strchr("+-",source[nextidx])!=NULL){ + nextidx++; + if(eof(nextidx))throw ParseError("Incomplete floating point literal at EOF"); + } + while(!eof(nextidx)&&isdigit(source[nextidx]))nextidx++; + } + return true; + } + + // String literal + if(source[nextidx]=='"'){ + ttype=Token::Type::string; + nextidx++; + while(!eof(nextidx)&&source[nextidx]!='"'){ + if(source[nextidx]=='\\')nextidx++; + } + if(eof(nextidx))throw ParseError("Incomplete string literal at EOF"); + nextidx++; + return true; + } + + // Symbol + if(isSymbolPrefix({source[idx]})){ + ttype=Token::Type::symbol; + nextidx++; + while(!eof(nextidx)){ + if(!isSymbolPrefix(get_()))return true; + nextidx++; + } + if(contains(tok_symbols,get_()))return true; + else throw ParseError("Unknown symbol at EOF"); + } + + throw ParseError("Unknown token starting at '"+source.substr(idx,5)+"'"); + } +}; + +StatementList parse(const string &source,const string &filename){ + Tokeniser tokeniser(source,filename); + StatementList stl; + while(tokeniser.advance()){ + Token tok=tokeniser.get(); + + } + return stl; +} |