#!/usr/bin/env cabal
{- cabal:
build-depends: base, containers, bytestring, bytestring-strict-builder
-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE StrictData #-}
{-# LANGUAGE ViewPatterns #-}
{-# OPTIONS -Wall #-}
module Main where

import qualified ByteString.StrictBuilder as BSB
import Data.Bits (shiftR)
import qualified Data.ByteString as BS
import Data.Foldable (toList)
import Data.Int (Int32)
import Data.List (sort)
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as Map
import Data.Word (Word32)
import System.Environment (getArgs)
import System.Exit (die)

-- | Split a list on every occurrence of the separator; the separator is dropped.
splitOn :: Eq a => a -> [a] -> [[a]]
splitOn sep l = case break (== sep) l of
  (pre, _ : post) -> pre : splitOn sep post
  (pre, []) -> [pre]

-- | Split a list into consecutive chunks of at most @n@ elements.
blockBy :: Int -> [a] -> [[a]]
blockBy n l = case splitAt n l of
  (pre, []) -> [pre]
  (pre, post) -> pre : blockBy n post

hasDuplicates :: Ord a => [a] -> Bool
hasDuplicates (sort -> l) = any (uncurry (==)) (zip l (drop 1 l))

data Row i = Row
  { rLexeme :: String
  , rOrtho :: String
  , rReading :: [i]
  , rPOS :: String
  , rFreq :: Int
  }
  deriving (Show)

-- | Parse the CSV: one row per line, fields are
-- lexeme, orthography, reading, part of speech, frequency.
readCSV :: String -> [Row Char]
readCSV = map (toRow . splitOn ',') . lines
  where
    toRow :: [String] -> Row Char
    toRow [lx, ort, rea, pos, fre] = Row lx ort rea pos (read fre)
    toRow row = error $ "Invalid row: " ++ show row

-- | Parse the kana table: each line contains a hiragana character and its
-- katakana counterpart. Returns (hira->kata, kata->ID, num IDs).
readTable :: String -> (Map Char Char, Map Char Int, Int)
readTable input =
  let list = [(h, k) | [[h], [k]] <- map words (lines input)]
  in (Map.fromList list
     ,Map.fromList (zip (map snd list) [0..])
     ,length list)

-- | Turn a row with a raw kana reading into zero or more rows with the
-- reading as a list of kana IDs: optional "(...)" parts are expanded both
-- ways, "/" separates alternative readings, and readings of "0", "#N/A" or
-- containing characters outside the table are dropped.
normalise :: Map Char Int -> Row Char -> [Row Int]
normalise any2id row = do
  rea <- normReading (rReading row)
  return row { rReading = rea }
  where
    normReading r = expandParens r >>= splitSlash >>= unKana

    expandParens r
      | (pre, '(' : r1) <- break (== '(') r
      , (inner, ')' : post) <- break (== ')') r1
      = [pre ++ post, pre ++ inner ++ post]
      | otherwise = [r]

    splitSlash = splitOn '/'

    unKana :: String -> [[Int]]
    unKana "0" = []
    unKana "#N/A" = []
    unKana r = toList (traverse unKanaChar r)

    unKanaChar :: Char -> Maybe Int
    unKanaChar c = Map.lookup c any2id

data Trie = Node
  [(String, Int)]  -- ^ Words with this reading
  [(Int, Trie)]    -- ^ Sub-tries
  deriving (Show)

-- | Build the trie over readings: rows whose reading is exactly the current
-- prefix are stored in this node; longer readings are grouped by their next
-- kana ID into sub-tries.
makeTrie :: Int -> [Row Int] -> Trie
makeTrie prefixlen rows =
  let here = [row | row <- rows, length (rReading row) == prefixlen]
      longer = Map.fromListWith (++)
                 [(rReading row !! prefixlen, [row])
                 | row <- rows, length (rReading row) > prefixlen]
  in Node [(rLexeme r, rFreq r) | r <- here]
          (Map.assocs (makeTrie (prefixlen + 1) <$> longer))

-- Serialised format of the trie:
-- - Node =
--   - word8 NL
--   - NL * Lexeme: lexemes with the current prefix as reading
--   - word8 NS
--   - NS * Edge: longer words
-- - Lexeme =
--   - word8 LEN
--   - LEN * word8: the lexeme in UTF-8
--   - word32LE: word frequency
-- - Edge =
--   - word8: the kana ID on this edge of the trie (add this to the end of the
--     running prefix)
--   - word32LE: absolute byte offset pointing to another Node
--
-- The root Node for the empty prefix is simply at the start of the file
-- (byte offset 0).
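--
-- As a worked example (a hypothetical entry, not taken from the real data):
-- a root Node holding the single lexeme "猫" with frequency 42 and no
-- sub-tries would serialise to these 10 bytes:
--
--   01            NL = 1 lexeme
--   03 E7 8C AB   LEN = 3, UTF-8 bytes of "猫"
--   2A 00 00 00   frequency 42 as word32LE
--   00            NS = 0 edges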
serialiseTrie :: Int -> Int -> Trie -> BSB.Builder
serialiseTrie nIDs topoffset (Node lexemes subs)
  | nIDs > 255 = error "Alphabet cannot be >255 in size with current database format"
  | length lexemes > 255 = error $ "Too many lexemes for single reading: " ++ show lexemes
  | otherwise =
      let bwords = foldMap serialiseWord lexemes
      in BSB.word8 (fromIntegral (length lexemes))
           <> bwords
           <> serialiseSubs (topoffset + 1 + BSB.builderLength bwords)
  where
    serialiseWord (word, freq) =
      let bword = foldMap BSB.utf8Char word
          len = BSB.builderLength bword
      in if | len > 255 -> error $ "Word too long: " ++ word
            | freq < 0 || freq > fromIntegral (maxBound :: Int32) ->
                error $ "Frequency too large: " ++ show freq
            | otherwise ->
                BSB.word8 (fromIntegral len) <> bword <> word32LE (fromIntegral freq)

    serialiseSubs subsoffset
      | any (\c -> c < 0 || c >= nIDs) (map fst subs) || hasDuplicates (map fst subs) =
          error $ "Invalid sub keys: " ++ show (map fst subs)
      | otherwise =
          BSB.word8 (fromIntegral (length subs)) <> go subs mempty
      where
        -- Offset where the serialised sub-nodes start: just past the NS byte
        -- and the 5-byte (kana ID, offset) edge records.
        restoffset = subsoffset + 1 + length subs * 5

        go [] restBuilder = restBuilder
        go ((key, trie) : subs') restBuilder =
          let boffset = restoffset + BSB.builderLength restBuilder
              b = serialiseTrie nIDs boffset trie
          in BSB.word8 (fromIntegral key)
               <> word32LE (fromIntegral boffset)
               <> go subs' (restBuilder <> b)

word32LE :: Word32 -> BSB.Builder
word32LE w = mconcat [BSB.word8 (fromIntegral (w `shiftR` (8 * i))) | i <- [0 .. 3]]

main :: IO ()
main = do
  (csvfname, outfname) <- getArgs >>= \case
    [csvfname, outfname] -> return (csvfname, outfname)
    _ -> die "Usage: process-database <database.csv> <output.bin>"

  (hira2kata, kata2id, nIDs) <- readTable <$> readFile "table.txt"
  let any2id = Map.map (kata2id Map.!) hira2kata <> kata2id

  rows <- concatMap (normalise any2id) . tail . readCSV <$> readFile csvfname
  let trie = makeTrie 0 rows
  -- print trie

  BS.writeFile outfname (BSB.builderBytes (serialiseTrie nIDs 0 trie))
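
-- The function below is not used by this tool; it is a minimal sketch of the
-- corresponding reader, included only to illustrate the format documented
-- above serialiseTrie. The name lookupReading and its interface are made up
-- for this example; it does no bounds checking and assumes a well-formed
-- database.
lookupReading :: BS.ByteString -> [Int] -> Maybe [(BS.ByteString, Word32)]
lookupReading db = go 0
  where
    byteAt :: Int -> Int
    byteAt i = fromIntegral (BS.index db i)

    -- Read a little-endian word32 at the given byte offset.
    word32At :: Int -> Word32
    word32At i = foldr (\k acc -> acc * 256 + fromIntegral (byteAt (i + k))) 0 [0 .. 3]

    -- Parse the lexeme list of a Node at the given offset; return the
    -- (lexeme, frequency) pairs and the offset of the edge list that follows.
    lexemesAt :: Int -> ([(BS.ByteString, Word32)], Int)
    lexemesAt off = loop (byteAt off) (off + 1)
      where
        loop 0 o = ([], o)
        loop n o =
          let len = byteAt o
              word = BS.take len (BS.drop (o + 1) db)
              freq = word32At (o + 1 + len)
              (rest, o') = loop (n - 1) (o + 1 + len + 4)
          in ((word, freq) : rest, o')

    -- Follow one edge per remaining kana ID; the lexemes at the final Node
    -- are the words whose reading is exactly the queried ID sequence.
    go :: Int -> [Int] -> Maybe [(BS.ByteString, Word32)]
    go off [] = Just (fst (lexemesAt off))
    go off (k : ks) =
      let (_, edgesOff) = lexemesAt off
          nEdges = byteAt edgesOff
          edges = [ (byteAt (edgesOff + 1 + 5 * j), word32At (edgesOff + 2 + 5 * j))
                  | j <- [0 .. nEdges - 1] ]
      in case lookup k edges of
           Just next -> go (fromIntegral next) ks
           Nothing -> Nothing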