diff options
author | Tom Smeding <tom@tomsmeding.com> | 2023-04-23 16:15:21 +0200 |
---|---|---|
committer | Tom Smeding <tom@tomsmeding.com> | 2023-04-23 16:15:21 +0200 |
commit | f43838280ce37c8ce67eda2d36f4021439f0a84c (patch) | |
tree | 312d18a829f7fe311ea2a13463600245b2a2d53a /Parser.hs | |
parent | b036b1cac1377cdbb9cc57ae6124cd6d6e5775a9 (diff) |
Untested minimal viable parser
Diffstat (limited to 'Parser.hs')
-rw-r--r-- | Parser.hs | 377 |
1 files changed, 287 insertions, 90 deletions
@@ -2,7 +2,17 @@ {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE TupleSections #-} -module Parser where +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE GADTs #-} +{-# LANGUAGE StandaloneDeriving #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE RankNTypes #-} +module Parser ( + parse, + printErrMsg, + -- * Re-exports + These(..), +) where import Control.Applicative import Control.Monad.Chronicle @@ -119,6 +129,10 @@ pProgram = do pFunDef :: Parser (FunDef ()) pFunDef = do skipWhiteComment + pFunDef0 + +pFunDef0 :: Parser (FunDef ()) +pFunDef0 = do mtypesig <- optional pStandaloneTypesig0 let mname = fst <$> mtypesig mtype = snd <$> mtypesig @@ -137,21 +151,22 @@ pFunEq mCheckName = do skipWhiteComment assertAtBlockLeft Fatal "Expected function clause, found indented stuff" - name <- pIdentifier0 Lowercase + name <- pIdentifier0 AtLeft Lowercase case mCheckName of Just checkName | name /= checkName -> raise Fatal "Name of function clause does not correspond with type signature" _ -> return () pats <- many (pPattern 11) - rhs <- pRHS + rhs <- pRHS "=" return (FunEq name pats rhs) -pRHS :: Parser (RHS ()) -pRHS = do +-- | Pass "=" for function definitions and "->" for case clauses. +pRHS :: String -> Parser (RHS ()) +pRHS sepsym = do -- TODO: parse guards inlineWhite - string "=" + pKeySym sepsym Plain <$> pExpr pPattern :: Int -> Parser (Pattern ()) @@ -165,43 +180,43 @@ pPattern0 d = do ,pPatList0 ,pPatParens0] where - pPatWildcard0 = string "_" >> return (PWildcard ()) + pPatWildcard0 = pKeySym "_" >> return (PWildcard ()) pPatVarOrAs0 = do - var <- pIdentifier0 Lowercase + var <- pIdentifier0 InBlock Lowercase asum [do inlineWhite - string "@" + pKeySym "@" p <- pPattern 11 return (PAs () var p) ,return (PVar () var)] pPatCon0 = do - con <- pIdentifier0 Uppercase + con <- pIdentifier0 InBlock Uppercase if d > 0 then return (PCon () con []) else do args <- many (pPattern 11) return (PCon () con args) pPatList0 = do - string "[" - ps <- pPattern 0 `sepBy` (inlineWhite >> string ",") + char '[' -- special syntax, no need for pKeySym + ps <- pPattern 0 `sepBy` (inlineWhite >> char ',') inlineWhite - string "]" + char ']' return (PList () ps) pPatParens0 = do - string "(" + char '(' inlineWhite - asum [do string ")" + asum [do char ')' return (PTup () []) ,do p <- pPattern0 0 inlineWhite - asum [do string ")" + asum [do char ')' return p - ,do string "," - ps <- pPattern 0 `sepBy1` (inlineWhite >> string ",") + ,do char ',' + ps <- pPattern 0 `sepBy1` (inlineWhite >> char ',') return (PTup () (p : ps))]] pExpr :: Parser (Expr ()) pExpr = do inlineWhite - -- basics: lit, list, tup + -- basics: lit, list, var, con, tup -- expression atom: application of basics -- expression parser: op -- around: let, case, if @@ -210,6 +225,60 @@ pExpr = do ,pEIf0 ,pExprOpExpr0 0] +pELet0 :: Parser (Expr ()) +pELet0 = do + pKeyword "let" + inlineWhite + startLayoutBlock $ do + -- The first occurrence is also going to skip whitespace in front, + -- which is redundant -- but not harmful. + defs <- many $ do + skipWhiteComment + -- Note: now not necessarily in the indented block. Which is + -- precisely what we need here. If we see "in", let the 'many' + -- choice fail so that the defs loop ends. But let it fail outside + -- this asum so that it is the many that picks it up, not this + -- asum. + res <- asum [Nothing <$ lookAhead (pKeyword "in") + ,Just <$> pFunDef0] + case res of + Nothing -> empty + Just def -> return def + inlineWhite + body <- pExpr + return (ELet () defs body) + +pECase0 :: Parser (Expr ()) +pECase0 = do + pKeyword "case" + e <- pExpr + inlineWhite + pKeyword "of" + inlineWhite + startLayoutBlock $ do + -- The first clause is going to skip whitespace, but that's harmless + -- (though redundant). + let pClause = do + skipWhiteComment + whenM (not <$> isInsideBlock) (() <$ empty) + pat <- pPattern0 0 + rhs <- pRHS "->" + return (pat, rhs) + clauses <- many pClause + return (ECase () e clauses) + +pEIf0 :: Parser (Expr ()) +pEIf0 = do + pKeyword "if" + e1 <- pExpr + inlineWhite + pKeyword "then" + e2 <- pExpr + inlineWhite + pKeyword "else" + e3 <- pExpr + return (EIf () e1 e2 e3) + pExprOpExpr :: Int -> Parser (Expr ()) pExprOpExpr d = inlineWhite >> pExprOpExpr0 d @@ -229,7 +298,8 @@ pExprOpExpr0 d = do return () let oprhsd = case a2 of AssocRight -> d2 ; _ -> d2 + 1 rhs <- pExprOpExpr oprhsd - climbRight (EOp () lhs op rhs) (Just paop)] + climbRight (EOp () lhs op rhs) (Just paop) + ,return lhs] pEApp0 :: Parser (Expr ()) pEApp0 = do @@ -240,7 +310,79 @@ pEApp0 = do _ -> return (EApp () e1 es) pEAtom0 :: Parser (Expr ()) -pEAtom0 = pELit <|> pEList <|> pEParens +pEAtom0 = (ELit () <$> pLiteral0) <|> pEList0 <|> pEVar0 <|> pECon0 <|> pEParens0 + +pLiteral0 :: Parser Literal +pLiteral0 = asum + [do as <- some (satisfy isDigit) + let a = read as :: Integer + asum + [do char '.' + bs <- some (satisfy isDigit) + let b = read bs :: Integer + cs <- optional $ do + char 'e' + cs <- some (satisfy isDigit) + return cs + let c = maybe 0 read cs :: Integer + return (LFloat ((fromIntegral a + fromIntegral b / 10 ^ length bs) * 10 ^ c)) + ,return (LInt a)] + ,do char '\'' + c <- pStringChar + char '\'' + return (LChar c) + ,do char '"' + s <- many pStringChar + char '"' + return (LString s)] + +pStringChar :: Parser Char +pStringChar = asum + [do char '\\' + char 'x' + let hexdig = do + c <- satisfy $ \c' -> + let c = toLower c' + in 'a' <= c && c <= 'f' || '0' <= c && c <= '9' + return $ if 'a' <= c then 10 + ord c - ord 'a' + else ord c - ord '0' + digs <- some hexdig + return (chr (sum (zipWith (*) (reverse digs) (iterate (*16) 1)))) + ,do char '\\' + satisfy (const True) >>= \case + 'n' -> return '\n' + 'r' -> return '\r' + 't' -> return '\t' + 'a' -> return '\a' + 'b' -> return '\b' + '\'' -> return '\'' + '\"' -> return '\"' + '0' -> return '\0' + c -> do raise Error $ "Invalid escape sequence: \\" ++ [c] + return '?' + ,do satisfy (\c -> c `notElem` "\n\r\\\'")] + +pEList0 :: Parser (Expr ()) +pEList0 = do + char '[' -- special syntax, no need for pKeySym + es <- sepBy pExpr (inlineWhite >> char ',') + inlineWhite + char ']' + return (EList () es) + +pEVar0 :: Parser (Expr ()) +pEVar0 = EVar () <$> pIdentifier0 InBlock Lowercase + +pECon0 :: Parser (Expr ()) +pECon0 = ECon () <$> pIdentifier0 InBlock Uppercase + +pEParens0 :: Parser (Expr ()) +pEParens0 = do + char '(' + e <- pExpr + inlineWhite + char ')' + return e data Associativity = AssocLeft | AssocRight | AssocNone deriving (Show, Eq) @@ -251,21 +393,21 @@ data ParsedOperator = PaOp Operator Int Associativity pInfixOp :: Parser ParsedOperator pInfixOp = do inlineWhite - asum [PaOp OEqu 4 AssocNone <$ string "==" - ,PaOp OAdd 6 AssocLeft <$ string "+" - ,PaOp OSub 6 AssocLeft <$ string "-" - ,PaOp OMul 7 AssocLeft <$ string "*" - ,PaOp ODiv 7 AssocLeft <$ string "/" - ,PaOp OMod 7 AssocLeft <$ string "%" - ,PaOp OPow 8 AssocRight <$ string "^" + asum [PaOp OEqu 4 AssocNone <$ pKeySym "==" + ,PaOp OAdd 6 AssocLeft <$ pKeySym "+" + ,PaOp OSub 6 AssocLeft <$ pKeySym "-" + ,PaOp OMul 7 AssocLeft <$ pKeySym "*" + ,PaOp ODiv 7 AssocLeft <$ pKeySym "/" + ,PaOp OMod 7 AssocLeft <$ pKeySym "%" + ,PaOp OPow 8 AssocRight <$ pKeySym "^" ] pStandaloneTypesig0 :: Parser (Name, Type) pStandaloneTypesig0 = do assertAtBlockLeft Fatal "Expected top-level definition, found indented stuff" - name@(Name namestr) <- pIdentifier0 Lowercase + name@(Name namestr) <- pIdentifier0 AtLeft Lowercase inlineWhite - string "::" + pKeySym "::" pushContext ("type signature for '" ++ namestr ++ "'") $ do ty <- pType return (name, ty) @@ -274,7 +416,7 @@ pType :: Parser Type pType = do ty1 <- pTypeApp asum [do inlineWhite - string "->" + pKeySym "->" ty2 <- pType return (TFun ty1 ty2) ,return ty1] @@ -290,30 +432,42 @@ pTypeAtom = pTypeParens <|> pTypeList <|> pTypeCon <|> pTypeVar where pTypeParens = do inlineWhite - string "(" + char '(' asum [do inlineWhite - string ")" + char ')' return (TTup []) ,do ty1 <- pType ty2s <- many $ do inlineWhite - string "," + char ',' pType inlineWhite - string ")" + char ')' case ty2s of [] -> return ty1 _ -> return (TTup (ty1 : ty2s))] pTypeList = do inlineWhite - string "[" + char '[' ty <- pType - string "]" + char ']' return (TList ty) - pTypeCon = inlineWhite >> TCon <$> pIdentifier0 Uppercase - pTypeVar = inlineWhite >> TVar <$> pIdentifier0 Lowercase + pTypeCon = inlineWhite >> TCon <$> pIdentifier0 InBlock Uppercase + pTypeVar = inlineWhite >> TVar <$> pIdentifier0 InBlock Lowercase + +-- | Parse the given name-like keyword, ensuring that it is the entire word. +pKeyword :: String -> Parser () +pKeyword s = do + string s + notFollowedBy (() <$ satisfy isNameContChar) + +-- | Parse the given symbol-like keyword, ensuring that it is the entire symbol. +pKeySym :: String -> Parser () +pKeySym s = do + string s + notFollowedBy (() <$ satisfy isSymbolChar) data Case = Uppercase | Lowercase deriving (Show) @@ -321,20 +475,32 @@ data Case = Uppercase | Lowercase -- | Consumes an identifier (word or parenthesised operator) at the current -- position. The `var` production in Haskell2010. -- var -> varid | "(" varsym ")" -pIdentifier0 :: Case -> Parser Name -pIdentifier0 cs = pAlphaName0 cs <|> pParens0 (pSymbol0 cs) +pIdentifier0 :: BlockPos -> Case -> Parser Name +pIdentifier0 bpos cs = pAlphaName0 bpos cs <|> pParens0 (pSymbol0 bpos cs) + where + -- | Parser between parens, with the opening paren at the current position. + pParens0 :: Parser a -> Parser a + pParens0 p = do + char '(' + inlineWhite + res <- p + inlineWhite + char ')' + return res -- | Consumes a word-like name at the current position with the given case. The -- `varid` production in Haskell2010 for 'Lowercase', `conid' for 'Uppercase'. -- -- > varid -> (small {small | large | digit | "'"}) without reservedid -pAlphaName0 :: Case -> Parser Name -pAlphaName0 cs = do - (_, s) <- readInline (\case True -> \case Just c | isInitNameChar c -> Just (Right False) - _ -> Nothing - False -> \case Just c | isNameContChar c -> Just (Right False) - _ -> Just (Left ())) - True +pAlphaName0 :: BlockPos -> Case -> Parser Name +pAlphaName0 bpos cs = do + (_, s) <- readToken bpos + (\atfst mc -> case (atfst, mc) of + (True , Just c) | isNameHeadChar c -> Just (Right False) + (True , _ ) -> Nothing + (False, Just c) | isNameContChar c -> Just (Right False) + (False, _ ) -> Just (Left ())) + True name <- case cs of Uppercase | isLower (head s) -> do raise Error "Unexpected uppercase word at this position, assuming typo" @@ -348,10 +514,12 @@ pAlphaName0 cs = do ,"infixr", "instance", "let", "module", "newtype", "of" ,"then", "type", "where", "_"]) return (Name name) - where - isInitNameChar, isNameContChar :: Char -> Bool - isInitNameChar c = isLetter c || c == '_' - isNameContChar c = isInitNameChar c || isDigit c || c == '\'' + +isNameHeadChar :: Char -> Bool +isNameHeadChar c = isLetter c || c == '_' + +isNameContChar :: Char -> Bool +isNameContChar c = isNameHeadChar c || isDigit c || c == '\'' -- | Consumes a symbol at the current position. The `varsym` production in -- Haskell2010 for 'Lowercase', `consym` otherwise. @@ -364,12 +532,11 @@ pAlphaName0 cs = do -- > dashes -> "--" {"-"} -- > special -> ```(),;[]`{}``` -- > reservedop -> ".." | ":" | "::" | "=" | "\" | "|" | "<-" | "->" | "@" | "~" | "=>" -pSymbol0 :: Case -> Parser Name -pSymbol0 cs = do - let isSpecialExt c = c `elem` "(),;[]`{}_\"'" - isAscSymbol c = c `elem` "!#$%&⋆+./<=>?@^|-~:" - isUniSymbol c = isSymbol c || isPunctuation c - isSymbolChar c = (isAscSymbol c || isUniSymbol c) && not (isSpecialExt c) +pSymbol0 :: BlockPos -> Case -> Parser Name +pSymbol0 bpos cs = do + case bpos of + AtLeft -> assertAtBlockLeft Fatal "Expected symbol, but found indented expression" + InBlock -> assertInsideBlock Fatal "Expected symbol, but found end of indented expression" name <- (:) <$> (case cs of Lowercase -> satisfy (\c -> isSymbolChar c && c /= ':') Uppercase -> satisfy (== ':')) <*> many (satisfy isSymbolChar) @@ -377,19 +544,12 @@ pSymbol0 cs = do guard (take 2 name /= "--") return (Name name) --- | Parser between parens, with the opening paren at the current position. --- Enforces that all components are within the current indented block. -pParens0 :: Parser a -> Parser a -pParens0 p = do - string "(" - skipWhiteComment - assertInsideBlock Error "Unexpected dedent after opening parenthesis" - res <- p - assertInsideBlock Error "Unexpected dedent in parenthesised expression" - skipWhiteComment - assertInsideBlock Error "Unexpected dedent in parenthesised expression" - string ")" - return res +isSymbolChar :: Char -> Bool +isSymbolChar c = (isAscSymbol || isUniSymbol) && not isSpecialExt + where + isSpecialExt = c `elem` "(),;[]`{}_\"'" + isAscSymbol = c `elem` "!#$%&⋆+./<=>?@^|-~:" + isUniSymbol = isSymbol c || isPunctuation c sepBy1 :: Parser a -> Parser sep -> Parser [a] @@ -412,11 +572,17 @@ startLayoutBlock p = do , psBlkCol = psBlkCol ps0 }) return res -data Fatality = Error | Fatal - deriving (Show) +data Fatality fatal where + Error :: Fatality 'False + Fatal :: Fatality 'True +deriving instance Show (Fatality fatal) + +type family FatalCtx fatal a where + FatalCtx 'False a = a ~ () + FatalCtx 'True a = () -- | Raise an error with the given fatality and description. -raise :: Fatality -> String -> Parser () +raise :: FatalCtx fatal a => Fatality fatal -> String -> Parser a raise fat msg = do Context { ctxFile = fp , ctxStack = stk } <- ask PS { psLine = line, psCol = col } <- get @@ -425,18 +591,27 @@ raise fat msg = do Fatal -> confess . pure fun (ErrMsg fp stk line col msg) +raise' :: Fatality fatal -> String -> Parser () +raise' Error = raise Error +raise' Fatal = raise Fatal + -- | Registers a scope description on the stack for error reporting. pushContext :: String -> Parser a -> Parser a pushContext descr = local (\c -> c { ctxStack = descr : ctxStack c }) --- | Consumes an inline token at the current position, asserting that we are --- within the current block at the start. The token is defined by a pure --- stateful parser. If encountering a newline or EOF, the parser is run on this --- character ('Nothing' for EOF); if this produces a result, the result is --- returned; otherwise, the parser fails. The newline is not consumed. -readInline :: (s -> Maybe Char -> Maybe (Either r s)) -> s -> Parser (r, String) -readInline f s0 = do - assertInsideBlock Fatal "Expected stuff, but found end of indented expression" +data BlockPos = AtLeft | InBlock + deriving (Show) + +-- | Consumes a token at the current position, asserting that we are +-- in the position indicated by the 'BlockPos' argument. The token is defined +-- by a pure stateful parser. If encountering a newline or EOF, the parser is +-- run on this character ('Nothing' for EOF); if this produces a result, the +-- result is returned; otherwise, the parser fails. The newline is not consumed. +readToken :: BlockPos -> (s -> Maybe Char -> Maybe (Either r s)) -> s -> Parser (r, String) +readToken bpos f s0 = do + case bpos of + AtLeft -> assertAtBlockLeft Fatal "Expected token, but found indented expression" + InBlock -> assertInsideBlock Fatal "Expected token, but found end of indented expression" let loop :: (s -> Maybe Char -> Maybe (Either r s)) -> s -> Parser (r, String) loop f' st = do ps <- get @@ -476,7 +651,7 @@ inlineSpaces = readWhileInline isSpace -- end outside the current block. blockComment :: Parser () blockComment = do - string "{-" + string "{-" -- no need for pKeySym here let loop = do readWhileInline (`notElem` "{-") -- "-}" also starts with '-' asum [string "-}" @@ -488,21 +663,24 @@ blockComment = do -- | Consumes a line comment marker and the rest of the line, excluding -- newline. lineComment :: Parser () -lineComment = string "--" >> readWhileInline (const True) +lineComment = do + -- '--!' is an operator, so we need to parse a whole symbol here. + pKeySym "--" + readWhileInline (const True) -- | Raises an error if we're not currently at the given column. -assertAtBlockLeft :: Fatality -> String -> Parser () -assertAtBlockLeft fat msg = whenM (not <$> isAtBlockLeft) $ raise fat msg +assertAtBlockLeft :: Fatality fatal -> String -> Parser () +assertAtBlockLeft fat msg = whenM (not <$> isAtBlockLeft) $ raise' fat msg -- | Raises an error if psCol is not greater than psRefCol. -assertInsideBlock :: Fatality -> String -> Parser () -assertInsideBlock fat msg = whenM (not <$> isInsideBlock) $ raise fat msg +assertInsideBlock :: Fatality fatal -> String -> Parser () +assertInsideBlock fat msg = whenM (not <$> isInsideBlock) $ raise' fat msg -- | Raises an error if we're not currently at EOF. -assertEOF :: Fatality -> Parser () +assertEOF :: Fatality fatal -> Parser () assertEOF fat = gets psRest >>= \case [] -> return () - _ -> raise fat "Unexpected stuff" + _ -> raise' fat "Unexpected stuff" -- | Returns whether the current position is _within_ the current block, for -- soft-wrapping content. This means that col > blkCol. @@ -551,6 +729,11 @@ satisfy p = do return c _ -> empty +-- | Consumes exactly this character at the current position. Must not be a +-- newline. +char :: Char -> Parser () +char c = string [c] + -- | Consumes exactly this string at the current position. The string must not -- contain a newline. string :: String -> Parser () @@ -562,6 +745,20 @@ string s = do , psRest = drop (length s) (psRest ps) }) else empty +lookAhead :: Parser () -> Parser () +lookAhead p = do + ps <- get + success <- absolve False (True <$ p) + put ps -- restore state, as if nothing happened + when (not success) empty + +notFollowedBy :: Parser () -> Parser () +notFollowedBy p = do + ps <- get + success <- absolve True (False <$ p) + put ps -- restore state, as if nothing happened + when (not success) empty + -- | Only succeeds at EOF. eof :: Parser () eof = gets psRest >>= \case [] -> return () |