From 4b500bd4c69b481a611a61e72795c450120a6a7c Mon Sep 17 00:00:00 2001
From: Tom Smeding
Date: Sat, 6 Jul 2024 23:12:16 +0200
Subject: More stuff

---
 process-database.hs | 155 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100644 process-database.hs

diff --git a/process-database.hs b/process-database.hs
new file mode 100644
index 0000000..1fa9813
--- /dev/null
+++ b/process-database.hs
@@ -0,0 +1,155 @@
+#!/usr/bin/env cabal
+{- cabal:
+build-depends: base, containers, bytestring, bytestring-strict-builder
+-}
+
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE MultiWayIf #-}
+{-# LANGUAGE StrictData #-}
+{-# LANGUAGE ViewPatterns #-}
+{-# OPTIONS -Wall #-}
+module Main where
+
+import qualified ByteString.StrictBuilder as BSB
+import Data.Bits (shiftR)
+import qualified Data.ByteString as BS
+import Data.Foldable (toList)
+import Data.Int (Int32)
+import Data.List (sort)
+import Data.Map.Strict (Map)
+import qualified Data.Map.Strict as Map
+import Data.Word (Word32)
+import System.Environment (getArgs)
+import System.Exit (die)
+
+
+splitOn :: Eq a => a -> [a] -> [[a]]
+splitOn sep l = case break (== sep) l of
+  (pre, _ : post) -> pre : splitOn sep post
+  (pre, []) -> [pre]
+
+blockBy :: Int -> [a] -> [[a]]
+blockBy n l = case splitAt n l of
+  (pre, []) -> [pre]
+  (pre, post) -> pre : blockBy n post
+
+hasDuplicates :: Ord a => [a] -> Bool
+hasDuplicates (sort -> l) = any (uncurry (==)) (zip l (drop 1 l))
+
+data Row i = Row
+  { rLexeme :: String
+  , rOrtho :: String
+  , rReading :: [i]
+  , rPOS :: String
+  , rFreq :: Int }
+  deriving (Show)
+
+readCSV :: String -> [Row Char]
+readCSV = map (toRow . splitOn ',') . lines
+  where
+    toRow :: [String] -> Row Char
+    toRow [lx, ort, rea, pos, fre] = Row lx ort rea pos (read fre)
+    toRow row = error $ "Invalid row: " ++ show row
+
+-- (hira->kata, kata->ID, num IDs)
+readTable :: String -> (Map Char Char, Map Char Int, Int)
+readTable input =
+  let list = [(h, k) | [[h], [k]] <- map words (lines input)]
+  in (Map.fromList list
+     ,Map.fromList (zip (map snd list) [0..])
+     ,length list)
+
+normalise :: Map Char Int -> Row Char -> [Row Int]
+normalise any2id row = do
+  rea <- normReading (rReading row)
+  return row { rReading = rea }
+  where
+    normReading r = expandParens r >>= splitSlash >>= unKana
+    expandParens r
+      | (pre, '(' : r1) <- break (== '(') r
+      , (inner, ')' : post) <- break (== ')') r1
+      = [pre ++ post, pre ++ inner ++ post]
+      | otherwise = [r]
+    splitSlash = splitOn '/'
+    unKana :: String -> [[Int]]
+    unKana "0" = []
+    unKana "#N/A" = []
+    unKana r = toList (traverse unKanaChar r)
+    unKanaChar :: Char -> Maybe Int
+    unKanaChar c = Map.lookup c any2id
+
+data Trie = Node [(String, Int)]  -- ^ Words with this reading
+                 [(Int, Trie)]    -- ^ Sub-tries
+  deriving (Show)
+
+makeTrie :: Int -> [Row Int] -> Trie
+makeTrie prefixlen rows =
+  let here = [row | row <- rows, length (rReading row) == prefixlen]
+      longer = Map.fromListWith (++)
+                 [(rReading row !! prefixlen, [row])
+                 | row <- rows, length (rReading row) > prefixlen]
+  in Node [(rLexeme r, rFreq r) | r <- here]
+          (Map.assocs (makeTrie (prefixlen + 1) <$> longer))
+
+-- Serialised format of the trie:
+-- - Node =
+--   - word8 NL
+--   - NL * Lexeme: lexemes with the current prefix as reading
+--   - word8 NS
+--   - NS * Edge: longer words
+-- - Lexeme =
+--   - word8 LEN
+--   - LEN * word8: the lexeme in UTF-8
+--   - word32LE: word frequency
+-- - Edge =
+--   - word8: the kana ID on this edge of the trie (add this to the end of the running prefix)
+--   - word32LE: absolute byte offset pointing to another Node
+--
+-- The root Node for the empty prefix is simply at the start of the file (byte offset 0).
+serialiseTrie :: Int -> Int -> Trie -> BSB.Builder
+serialiseTrie nIDs topoffset (Node lexemes subs)
+  | nIDs > 255 = error "Alphabet cannot be >255 in size with current database format"
+  | length lexemes > 255 = error $ "Too many lexemes for single reading: " ++ show lexemes
+  | otherwise =
+      let bwords = foldMap serialiseWord lexemes
+      in BSB.word8 (fromIntegral (length lexemes))
+           <> bwords
+           <> serialiseSubs (topoffset + 1 + BSB.builderLength bwords)
+  where
+    serialiseWord (word, freq) =
+      let bword = foldMap BSB.utf8Char word
+          len = BSB.builderLength bword
+      in if | len > 255 -> error $ "Word too long: " ++ word
+            | freq < 0 || freq > fromIntegral (maxBound :: Int32) -> error $ "Frequency too large: " ++ show freq
+            | otherwise -> BSB.word8 (fromIntegral len) <> bword <> word32LE (fromIntegral freq)
+
+    serialiseSubs subsoffset
+      | any (\c -> c < 0 || c >= nIDs) (map fst subs) || hasDuplicates (map fst subs) =
+          error $ "Invalid sub keys: " ++ show (map fst subs)
+      | otherwise = BSB.word8 (fromIntegral (length subs)) <> go subs mempty
+      where
+        restoffset = subsoffset + 1 + length subs * 5
+
+        go [] restBuilder = restBuilder
+        go ((key, trie) : subs') restBuilder =
+          let boffset = restoffset + BSB.builderLength restBuilder
+              b = serialiseTrie nIDs boffset trie
+          in BSB.word8 (fromIntegral key) <> word32LE (fromIntegral boffset) <> go subs' (restBuilder <> b)
+
+    word32LE :: Word32 -> BSB.Builder
+    word32LE w = mconcat [BSB.word8 (fromIntegral (w `shiftR` (8 * i))) | i <- [0 .. 3]]
+
+main :: IO ()
+main = do
+  (csvfname, outfname) <- getArgs >>= \case
+    [csvfname, outfname] -> return (csvfname, outfname)
+    _ -> die "Usage: process-database <csvfile> <outfile>"
+
+  (hira2kata, kata2id, nIDs) <- readTable <$> readFile "table.txt"
+  let any2id = Map.map (kata2id Map.!) hira2kata <> kata2id
+
+  rows <- concatMap (normalise any2id) . tail . readCSV <$> readFile csvfname
+
+  let trie = makeTrie 0 rows
+  -- print trie
+  BS.writeFile outfname (BSB.builderBytes (serialiseTrie nIDs 0 trie))
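
For reference, the two inputs this script expects. readTable keeps only those lines of table.txt that consist of exactly two single-character words, a hiragana/katakana pair per the "(hira->kata, kata->ID, num IDs)" comment, and numbers the katakana column 0, 1, ... in file order. A hypothetical excerpt (invented here, not taken from the repository):

    あ ア
    い イ
    か カ

The CSV is split with a plain splitOn ',', so fields cannot themselves contain commas; each row carries five fields (lexeme, orthography, reading, part of speech, frequency), and main drops a single header row via tail. An invented example row:

    食べる,食べる,タベル,verb,1234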
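
The reading field is messier than a single kana string, which is what normalise untangles: normReading first duplicates the reading with an optional parenthesised span dropped and kept (expandParens), then splits alternatives on '/' (splitSlash), and finally maps each kana to its ID (unKana), silently discarding any variant that is "0", "#N/A", or contains a character missing from the table. With the first two helpers lifted to the top level, one would see, for invented readings:

    expandParens "ハ(ッ)パ"   == ["ハパ", "ハッパ"]
    splitSlash "タベル/クウ"  == ["タベル", "クウ"]

so a single CSV row can contribute several distinct readings to the trie.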
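
As a concrete instance of the serialised format (worked out by hand from serialiseTrie; the word, frequency and kana ID are made up), the trie Node [] [(3, Node [("あ", 10)] [])], i.e. the single word "あ" with frequency 10 stored under the one-ID reading [3], becomes these 17 bytes:

    offset  bytes            meaning
    0       00               root: NL = 0 lexemes
    1       01               root: NS = 1 edge
    2       03 07 00 00 00   edge: kana ID 3, child Node at absolute offset 7
    7       01               child: NL = 1 lexeme
    8       03 E3 81 82      LEN = 3, "あ" in UTF-8
    12      0A 00 00 00      frequency 10 as word32LE
    16      00               child: NS = 0 edges

The child lands at offset 7 because serialiseSubs places node bodies after the complete edge table: restoffset = subsoffset + 1 + 1 * 5 = 7, with subsoffset = 1 since the root stores no lexemes.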
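
The format comment pins down a reader completely. Below is a minimal decoding sketch, not part of the commit: the names byteAt, word32LEAt, readLexemes and lookupReading are invented here, and truncated files or dangling offsets are not handled.

-- Sketch of a reader for the trie format above (helper names invented).
import qualified Data.ByteString as BS
import Data.Bits (shiftL)

-- Byte at an absolute offset, widened to Int.
byteAt :: BS.ByteString -> Int -> Int
byteAt bs i = fromIntegral (BS.index bs i)

-- Little-endian word32 at an absolute offset.
word32LEAt :: BS.ByteString -> Int -> Int
word32LEAt bs i = sum [byteAt bs (i + k) `shiftL` (8 * k) | k <- [0 .. 3]]

-- Parse the lexeme list of a Node, returning the lexemes together with
-- the offset just past them (where the NS byte of the same Node sits).
readLexemes :: BS.ByteString -> Int -> ([(BS.ByteString, Int)], Int)
readLexemes bs off0 = loop (byteAt bs off0) (off0 + 1) []
  where
    loop 0 off acc = (reverse acc, off)
    loop n off acc =
      let len  = byteAt bs off
          word = BS.take len (BS.drop (off + 1) bs)
          freq = word32LEAt bs (off + 1 + len)
      in loop (n - 1) (off + 1 + len + 4) ((word, freq) : acc)

-- Follow a reading (a list of kana IDs) from the root Node at offset 0;
-- return the words stored at the Node where it ends, if it is reachable.
lookupReading :: BS.ByteString -> [Int] -> Maybe [(BS.ByteString, Int)]
lookupReading bs = go 0
  where
    go off [] = Just (fst (readLexemes bs off))
    go off (k : ks) =
      let (_, nsoff) = readLexemes bs off  -- skip the lexeme list
          ns = byteAt bs nsoff
          edges = [(byteAt bs (nsoff + 1 + 5 * j), word32LEAt bs (nsoff + 2 + 5 * j))
                  | j <- [0 .. ns - 1]]    -- (kana ID, child offset) pairs
      in lookup k edges >>= \child -> go child ks

Loaded into ghci and applied to the 17-byte example above, lookupReading bytes [3] would yield Just [("\227\129\130", 10)], the UTF-8 bytes of "あ" paired with its frequency; any reading that walks off the trie returns Nothing.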