summary refs log tree commit diff
path: root/parser.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'parser.cpp')
-rw-r--r-- parser.cpp 211
1 files changed, 211 insertions, 0 deletions
diff --git a/parser.cpp b/parser.cpp
new file mode 100644
index 0000000..73ae368
--- /dev/null
+++ b/parser.cpp
@@ -0,0 +1,211 @@
+#include <stdexcept>
+#include <cstring>
+#include <cctype>
+#include <cassert>
+#include "parser.h"
+
+using namespace std;
+
+
+// Builds a ParseError whose what() text is taken from a std::string.
+ParseError::ParseError(const string &what_arg)
+    : runtime_error{what_arg}
+{
+}
+// Builds a ParseError whose what() text is taken from a C string.
+ParseError::ParseError(const char *what_arg)
+    : runtime_error{what_arg}
+{
+}
+
+
+// Returns whether c may start a word token: an alphabetic character
+// (as classified by isalpha) or an underscore.
+static bool isinitwordchar(char c){
+    // The <cctype> classifiers are undefined for negative arguments other
+    // than EOF, so bytes >= 0x80 must be widened through unsigned char.
+    return isalpha(static_cast<unsigned char>(c))||c=='_';
+}
+
+// Returns whether c may appear inside a word token after its first
+// character: a letter, a decimal digit or an underscore.
+static bool iswordchar(char c){
+    // isalnum is true exactly when isalpha or isdigit is. The cast avoids
+    // the undefined behaviour of passing a negative char to <cctype>.
+    return isalnum(static_cast<unsigned char>(c))||c=='_';
+}
+
+// Table of every symbol token recognised by the tokeniser.
+// isSymbolPrefix() scans it for prefix matches and the symbol branch of
+// Tokeniser::advance() checks candidates against it.
+static const vector<string> tok_symbols={
+ "==", "!=", ">", "<", ">=", "<=",
+ ":=", "=",
+ "+", "-", "*", "/", "%",
+ "(", ")", ",",
+ "{", "}", "?{", "??{", "}&",
+};
+
+// Returns whether s is a leading substring (possibly the whole) of at
+// least one entry in tok_symbols. An empty s matches every entry.
+static bool isSymbolPrefix(const string &s){
+    const size_t len=s.size();
+    for(size_t i=0;i<tok_symbols.size();i++){
+        const string &candidate=tok_symbols[i];
+        // A symbol shorter than s can never have s as a prefix.
+        if(candidate.size()<len)continue;
+        if(candidate.compare(0,len,s)==0)return true;
+    }
+    return false;
+}
+
+// Linear search: returns whether any element of v compares equal
+// (operator==) to target.
+template <typename T>
+static bool contains(const vector<T> &v,const T &target){
+    typename vector<T>::const_iterator it=v.begin();
+    while(it!=v.end()&&!(*it==target))++it;
+    return it!=v.end();
+}
+
+
+// One lexical token: its category, its text and the Site it is tagged with.
+class Token{
+public:
+    // Token categories.
+    enum class Type{
+        word,
+        number,
+        string,
+        symbol,
+    };
+
+    Type type;
+    string str;
+    Site site;
+
+    // Stores copies of all three arguments.
+    Token(Type kind,const string &text,const Site &where)
+        :type(kind),str(text),site(where){}
+};
+
+
+// Splits a source string into tokens on demand.
+//
+// Usage: call advance(); each time it returns true, get() yields the
+// current token. Lexical errors are reported by throwing ParseError.
+//
+// NOTE: source and filename are held by reference, so both strings must
+// outlive the Tokeniser.
+class Tokeniser{
+    const string &source;
+    const string &filename;
+    // The current token is source[idx, nextidx). nextidx is -1 until the
+    // first successful scan in advance().
+    i64 idx,nextidx;
+    // 1-based line number and column of source[idx].
+    i64 lnum,linex;
+    Token::Type ttype;
+
+    // Whether position `at` lies at or beyond the end of the source.
+    bool eof(i64 at) const {
+        return at>=(i64)source.size();
+    }
+
+    // isdigit wrapper that is safe for negative char values.
+    static bool isDigit(char c){
+        return isdigit(static_cast<unsigned char>(c))!=0;
+    }
+
+    // Moves idx forward by one character, keeping lnum/linex in step.
+    void step(){
+        if(source[idx]=='\n'){
+            lnum++;
+            linex=1;
+        } else {
+            linex++;
+        }
+        idx++;
+    }
+
+    // Moves nextidx past a (possibly empty) run of decimal digits.
+    void skipDigits(){
+        while(!eof(nextidx)&&isDigit(source[nextidx]))nextidx++;
+    }
+
+    // Text of the current token. Throws runtime_error when there is none.
+    string get_() const {
+        if(eof())throw runtime_error("Tokeniser::get() on eof");
+        if(nextidx==-1)throw runtime_error("Tokeniser::get() before advance");
+        if(nextidx==-2)throw runtime_error("Tokeniser::get() after eof");
+        assert(nextidx>=0);
+        return source.substr(idx,nextidx-idx);
+    }
+
+public:
+    Tokeniser(const string &source,const string &filename)
+        :source(source),filename(filename),
+         idx(0),nextidx(-1),
+         lnum(1),linex(1),
+         ttype(Token::Type::word){}  // placeholder; overwritten by advance()
+
+    // Whether the read position has reached the end of the source.
+    bool eof() const {
+        return idx>=(i64)source.size();
+    }
+
+    // Returns the current token, tagged with the position of its first
+    // character. Throws runtime_error if there is no current token.
+    Token get() const {
+        return Token(ttype,get_(),Site(filename,lnum,linex));
+    }
+
+    // Moves on to the next token, skipping whitespace and '#' line comments.
+    // Returns whether there are more tokens.
+    // Throws ParseError on malformed or unrecognised input.
+    bool advance(){
+        if(eof())return false;
+
+        // Step over the previous token (a no-op while nextidx is still -1).
+        while(idx<nextidx)step();
+
+        // Skip any interleaving of whitespace and comments; stop once a
+        // pass makes no progress.
+        while(true){
+            const i64 origidx=idx;
+            while(!eof()&&isspace(static_cast<unsigned char>(source[idx])))step();
+            if(eof())return false;
+
+            if(source[idx]=='#'){
+                while(!eof()&&source[idx]!='\n')idx++;
+                // A comment on the last line has no newline to consume.
+                if(eof())return false;
+                step();
+                if(eof())return false;
+            }
+
+            if(idx==origidx)break;
+        }
+
+        nextidx=idx;
+        const char first=source[idx];
+
+        // Word
+        if(isinitwordchar(first)){
+            ttype=Token::Type::word;
+            do nextidx++;
+            while(!eof(nextidx)&&iswordchar(source[nextidx]));
+            return true;
+        }
+
+        // Number literal: [-]digits[.digits][(e|E)[+|-]digits]
+        if(isDigit(first)||(first=='-'&&!eof(idx+1)&&isDigit(source[idx+1]))){
+            ttype=Token::Type::number;
+            if(first=='-')nextidx++;
+            skipDigits();
+            if(!eof(nextidx)&&source[nextidx]=='.'){
+                nextidx++;
+                if(eof(nextidx)||!isDigit(source[nextidx])){
+                    throw ParseError("Incomplete floating point literal at EOF");
+                }
+                skipDigits();
+            }
+            if(!eof(nextidx)&&(source[nextidx]=='e'||source[nextidx]=='E')){
+                nextidx++;
+                if(!eof(nextidx)&&(source[nextidx]=='+'||source[nextidx]=='-'))nextidx++;
+                // The exponent needs at least one digit, with or without a sign.
+                if(eof(nextidx)||!isDigit(source[nextidx])){
+                    throw ParseError("Incomplete floating point literal at EOF");
+                }
+                skipDigits();
+            }
+            return true;
+        }
+
+        // String literal
+        if(first=='"'){
+            ttype=Token::Type::string;
+            nextidx++;
+            while(!eof(nextidx)&&source[nextidx]!='"'){
+                // A backslash escapes the following character, so skip both.
+                if(source[nextidx]=='\\')nextidx++;
+                nextidx++;
+            }
+            if(eof(nextidx))throw ParseError("Incomplete string literal at EOF");
+            nextidx++;  // closing quote
+            return true;
+        }
+
+        // Symbol: take the longest entry of tok_symbols that matches here.
+        if(isSymbolPrefix(string(1,first))){
+            ttype=Token::Type::symbol;
+            i64 best=0;
+            for(const string &sym : tok_symbols){
+                const i64 len=(i64)sym.size();
+                if(len>best&&source.compare((size_t)idx,(size_t)len,sym)==0)best=len;
+            }
+            // The text starts like a symbol but completes none of them.
+            if(best==0){
+                throw ParseError("Unknown symbol starting at '"+source.substr(idx,5)+"'");
+            }
+            nextidx=idx+best;
+            return true;
+        }
+
+        throw ParseError("Unknown token starting at '"+source.substr(idx,5)+"'");
+    }
+};
+
+// Runs the tokeniser over the whole of source and returns a statement list.
+// In this revision each token is fetched and then discarded, so stl is
+// returned exactly as it was default-constructed. Errors thrown by the
+// tokeniser propagate to the caller.
+StatementList parse(const string &source,const string &filename){
+    Tokeniser tokeniser(source,filename);
+    StatementList stl;
+    for(;;){
+        if(!tokeniser.advance())break;
+        Token tok=tokeniser.get();
+
+    }
+    return stl;
+}