aboutsummaryrefslogtreecommitdiff
path: root/process-database.hs
diff options
context:
space:
mode:
Diffstat (limited to 'process-database.hs')
-rw-r--r--process-database.hs155
1 files changed, 155 insertions, 0 deletions
diff --git a/process-database.hs b/process-database.hs
new file mode 100644
index 0000000..1fa9813
--- /dev/null
+++ b/process-database.hs
@@ -0,0 +1,155 @@
+#!/usr/bin/env cabal
+{- cabal:
+build-depends: base, containers, bytestring, bytestring-strict-builder
+-}
+
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE MultiWayIf #-}
+{-# LANGUAGE StrictData #-}
+{-# LANGUAGE ViewPatterns #-}
+{-# OPTIONS -Wall #-}
+module Main where
+
+import qualified ByteString.StrictBuilder as BSB
+import Data.Bits (shiftR)
+import qualified Data.ByteString as BS
+import Data.Foldable (toList)
+import Data.Int (Int32)
+import Data.List (sort)
+import Data.Map.Strict (Map)
+import qualified Data.Map.Strict as Map
+import Data.Word (Word32)
+import System.Environment (getArgs)
+import System.Exit (die)
+
+
+splitOn :: Eq a => a -> [a] -> [[a]]
+splitOn sep l = case break (== sep) l of
+ (pre, _ : post) -> pre : splitOn sep post
+ (pre, []) -> [pre]
+
+blockBy :: Int -> [a] -> [[a]]
+blockBy n l = case splitAt n l of
+ (pre, []) -> [pre]
+ (pre, post) -> pre : blockBy n post
+
+hasDuplicates :: Ord a => [a] -> Bool
+hasDuplicates (sort -> l) = any (uncurry (==)) (zip l (drop 1 l))
+
+data Row i = Row
+ { rLexeme :: String
+ , rOrtho :: String
+ , rReading :: [i]
+ , rPOS :: String
+ , rFreq :: Int }
+ deriving (Show)
+
+readCSV :: String -> [Row Char]
+readCSV = map (toRow . splitOn ',') . lines
+ where
+ toRow :: [String] -> Row Char
+ toRow [lx, ort, rea, pos, fre] = Row lx ort rea pos (read fre)
+ toRow row = error $ "Invalid row: " ++ show row
+
+-- (hira->kata, kata->ID, num IDs)
+readTable :: String -> (Map Char Char, Map Char Int, Int)
+readTable input =
+ let list = [(h, k) | [[h], [k]] <- map words (lines input)]
+ in (Map.fromList list
+ ,Map.fromList (zip (map snd list) [0..])
+ ,length list)
+
+normalise :: Map Char Int -> Row Char -> [Row Int]
+normalise any2id row = do
+ rea <- normReading (rReading row)
+ return row { rReading = rea }
+ where
+ normReading r = expandParens r >>= splitSlash >>= unKana
+ expandParens r
+ | (pre, '(' : r1) <- break (== '(') r
+ , (inner, ')' : post) <- break (== ')') r1
+ = [pre ++ post, pre ++ inner ++ post]
+ | otherwise = [r]
+ splitSlash = splitOn '/'
+ unKana :: String -> [[Int]]
+ unKana "0" = []
+ unKana "#N/A" = []
+ unKana r = toList (traverse unKanaChar r)
+ unKanaChar :: Char -> Maybe Int
+ unKanaChar c = Map.lookup c any2id
+
+data Trie = Node [(String, Int)] -- ^ Words with this reading
+ [(Int, Trie)] -- ^ Sub-tries
+ deriving (Show)
+
+makeTrie :: Int -> [Row Int] -> Trie
+makeTrie prefixlen rows =
+ let here = [row | row <- rows, length (rReading row) == prefixlen]
+ longer = Map.fromListWith (++)
+ [(rReading row !! prefixlen, [row])
+ | row <- rows, length (rReading row) > prefixlen]
+ in Node [(rLexeme r, rFreq r) | r <- here]
+ (Map.assocs (makeTrie (prefixlen + 1) <$> longer))
+
+-- Serialised format of the trie:
+-- - Node =
+-- - word8 NL
+-- - NL * Lexeme: lexemes with the current prefix as reading
+-- - word8 NS
+-- - NS * Edge: longer words
+-- - Lexeme =
+-- - word8 LEN
+-- - LEN * word8: the lexeme in UTF-8
+-- - word32LE: word frequency
+-- - Edge =
+-- - word8: the kana ID on this edge of the trie (add this to the end of the running prefix)
+-- - word32LE: absolute byte offset pointing to another Node
+--
+-- The root Node for the empty prefix is simply at the start of the file (byte offset 0).
+serialiseTrie :: Int -> Int -> Trie -> BSB.Builder
+serialiseTrie nIDs topoffset (Node lexemes subs)
+ | nIDs > 255 = error "Alphabet cannot be >255 in size with current database format"
+ | length lexemes > 255 = error $ "Too many lexemes for single reading: " ++ show lexemes
+ | otherwise =
+ let bwords = foldMap serialiseWord lexemes
+ in BSB.word8 (fromIntegral (length lexemes))
+ <> bwords
+ <> serialiseSubs (topoffset + 1 + BSB.builderLength bwords)
+ where
+ serialiseWord (word, freq) =
+ let bword = foldMap BSB.utf8Char word
+ len = BSB.builderLength bword
+ in if | len > 255 -> error $ "Word too long: " ++ word
+ | freq < 0 || freq > fromIntegral (maxBound :: Int32) -> error $ "Frequency too large: " ++ show freq
+ | otherwise -> BSB.word8 (fromIntegral len) <> bword <> word32LE (fromIntegral freq)
+
+ serialiseSubs subsoffset
+ | any (\c -> c < 0 || c >= nIDs) (map fst subs) || hasDuplicates (map fst subs) =
+ error $ "Invalid sub keys: " ++ show (map fst subs)
+ | otherwise = BSB.word8 (fromIntegral (length subs)) <> go subs mempty
+ where
+ restoffset = subsoffset + 1 + length subs * 5
+
+ go [] restBuilder = restBuilder
+ go ((key, trie) : subs') restBuilder =
+ let boffset = restoffset + BSB.builderLength restBuilder
+ b = serialiseTrie nIDs boffset trie
+ in BSB.word8 (fromIntegral key) <> word32LE (fromIntegral boffset) <> go subs' (restBuilder <> b)
+
+ word32LE :: Word32 -> BSB.Builder
+ word32LE w = mconcat [BSB.word8 (fromIntegral (w `shiftR` (8 * i))) | i <- [0 .. 3]]
+
+main :: IO ()
+main = do
+ (csvfname, outfname) <- getArgs >>= \case
+ [csvfname, outfname] -> return (csvfname, outfname)
+ _ -> die "Usage: process-database <database.csv> <output.bin>"
+
+ (hira2kata, kata2id, nIDs) <- readTable <$> readFile "table.txt"
+ let any2id = Map.map (kata2id Map.!) hira2kata <> kata2id
+
+ rows <- concatMap (normalise any2id) . tail . readCSV <$> readFile csvfname
+
+ let trie = makeTrie 0 rows
+ -- print trie
+ BS.writeFile outfname (BSB.builderBytes (serialiseTrie nIDs 0 trie))