From 4b500bd4c69b481a611a61e72795c450120a6a7c Mon Sep 17 00:00:00 2001
From: Tom Smeding
Date: Sat, 6 Jul 2024 23:12:16 +0200
Subject: More stuff

---
 process-database.hs | 155 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100644 process-database.hs

diff --git a/process-database.hs b/process-database.hs
new file mode 100644
index 0000000..1fa9813
--- /dev/null
+++ b/process-database.hs
@@ -0,0 +1,155 @@
+#!/usr/bin/env cabal
+{- cabal:
+build-depends: base, containers, bytestring, bytestring-strict-builder
+-}
+
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE MultiWayIf #-}
+{-# LANGUAGE StrictData #-}
+{-# LANGUAGE ViewPatterns #-}
+{-# OPTIONS -Wall #-}
+module Main where
+
+import qualified ByteString.StrictBuilder as BSB
+import Data.Bits (shiftR)
+import qualified Data.ByteString as BS
+import Data.Foldable (toList)
+import Data.Int (Int32)
+import Data.List (sort)
+import Data.Map.Strict (Map)
+import qualified Data.Map.Strict as Map
+import Data.Word (Word32)
+import System.Environment (getArgs)
+import System.Exit (die)
+
+
+splitOn :: Eq a => a -> [a] -> [[a]]
+splitOn sep l = case break (== sep) l of
+  (pre, _ : post) -> pre : splitOn sep post
+  (pre, []) -> [pre]
+
+blockBy :: Int -> [a] -> [[a]]
+blockBy n l = case splitAt n l of
+  (pre, []) -> [pre]
+  (pre, post) -> pre : blockBy n post
+
+hasDuplicates :: Ord a => [a] -> Bool
+hasDuplicates (sort -> l) = any (uncurry (==)) (zip l (drop 1 l))
+
+data Row i = Row
+  { rLexeme :: String
+  , rOrtho :: String
+  , rReading :: [i]
+  , rPOS :: String
+  , rFreq :: Int }
+  deriving (Show)
+
+readCSV :: String -> [Row Char]
+readCSV = map (toRow . splitOn ',') . lines
+  where
+    toRow :: [String] -> Row Char
+    toRow [lx, ort, rea, pos, fre] = Row lx ort rea pos (read fre)
+    toRow row = error $ "Invalid row: " ++ show row
+
+-- (hira->kata, kata->ID, num IDs)
+readTable :: String -> (Map Char Char, Map Char Int, Int)
+readTable input =
+  let list = [(h, k) | [[h], [k]] <- map words (lines input)]
+  in (Map.fromList list
+     ,Map.fromList (zip (map snd list) [0..])
+     ,length list)
+
+normalise :: Map Char Int -> Row Char -> [Row Int]
+normalise any2id row = do
+  rea <- normReading (rReading row)
+  return row { rReading = rea }
+  where
+    normReading r = expandParens r >>= splitSlash >>= unKana
+    expandParens r
+      | (pre, '(' : r1) <- break (== '(') r
+      , (inner, ')' : post) <- break (== ')') r1
+      = [pre ++ post, pre ++ inner ++ post]
+      | otherwise = [r]
+    splitSlash = splitOn '/'
+    unKana :: String -> [[Int]]
+    unKana "0" = []
+    unKana "#N/A" = []
+    unKana r = toList (traverse unKanaChar r)
+    unKanaChar :: Char -> Maybe Int
+    unKanaChar c = Map.lookup c any2id
+
+data Trie = Node [(String, Int)]  -- ^ Words with this reading
+                 [(Int, Trie)]    -- ^ Sub-tries
+  deriving (Show)
+
+makeTrie :: Int -> [Row Int] -> Trie
+makeTrie prefixlen rows =
+  let here = [row | row <- rows, length (rReading row) == prefixlen]
+      longer = Map.fromListWith (++)
+                 [(rReading row !! prefixlen, [row])
+                 | row <- rows, length (rReading row) > prefixlen]
+  in Node [(rLexeme r, rFreq r) | r <- here]
+          (Map.assocs (makeTrie (prefixlen + 1) <$> longer))
+
+-- Serialised format of the trie:
+-- - Node =
+--   - word8 NL
+--   - NL * Lexeme: lexemes with the current prefix as reading
+--   - word8 NS
+--   - NS * Edge: longer words
+-- - Lexeme =
+--   - word8 LEN
+--   - LEN * word8: the lexeme in UTF-8
+--   - word32LE: word frequency
+-- - Edge =
+--   - word8: the kana ID on this edge of the trie (add this to the end of the running prefix)
+--   - word32LE: absolute byte offset pointing to another Node
+--
+-- The root Node for the empty prefix is simply at the start of the file (byte offset 0).
+serialiseTrie :: Int -> Int -> Trie -> BSB.Builder
+serialiseTrie nIDs topoffset (Node lexemes subs)
+  | nIDs > 255 = error "Alphabet cannot be >255 in size with current database format"
+  | length lexemes > 255 = error $ "Too many lexemes for single reading: " ++ show lexemes
+  | otherwise =
+      let bwords = foldMap serialiseWord lexemes
+      in BSB.word8 (fromIntegral (length lexemes))
+           <> bwords
+           <> serialiseSubs (topoffset + 1 + BSB.builderLength bwords)
+  where
+    serialiseWord (word, freq) =
+      let bword = foldMap BSB.utf8Char word
+          len = BSB.builderLength bword
+      in if | len > 255 -> error $ "Word too long: " ++ word
+            | freq < 0 || freq > fromIntegral (maxBound :: Int32) -> error $ "Frequency too large: " ++ show freq
+            | otherwise -> BSB.word8 (fromIntegral len) <> bword <> word32LE (fromIntegral freq)
+
+    serialiseSubs subsoffset
+      | any (\c -> c < 0 || c >= nIDs) (map fst subs) || hasDuplicates (map fst subs) =
+          error $ "Invalid sub keys: " ++ show (map fst subs)
+      | otherwise = BSB.word8 (fromIntegral (length subs)) <> go subs mempty
+      where
+        restoffset = subsoffset + 1 + length subs * 5
+
+        go [] restBuilder = restBuilder
+        go ((key, trie) : subs') restBuilder =
+          let boffset = restoffset + BSB.builderLength restBuilder
+              b = serialiseTrie nIDs boffset trie
+          in BSB.word8 (fromIntegral key) <> word32LE (fromIntegral boffset) <> go subs' (restBuilder <> b)
+
+    word32LE :: Word32 -> BSB.Builder
+    word32LE w = mconcat [BSB.word8 (fromIntegral (w `shiftR` (8 * i))) | i <- [0 .. 3]]
+
+main :: IO ()
+main = do
+  (csvfname, outfname) <- getArgs >>= \case
+    [csvfname, outfname] -> return (csvfname, outfname)
+    _ -> die "Usage: process-database <csvfile> <outfile>"
+
+  (hira2kata, kata2id, nIDs) <- readTable <$> readFile "table.txt"
+  let any2id = Map.map (kata2id Map.!) hira2kata <> kata2id
+
+  rows <- concatMap (normalise any2id) . tail . readCSV <$> readFile csvfname
+
+  let trie = makeTrie 0 rows
+  -- print trie
+  BS.writeFile outfname (BSB.builderBytes (serialiseTrie nIDs 0 trie))
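
For reference, the two inputs this script expects. readTable keeps only those lines of table.txt that consist of exactly two single-character words, a hiragana/katakana pair per the "(hira->kata, kata->ID, num IDs)" comment, and numbers the katakana column 0, 1, ... in file order. A hypothetical excerpt (invented here, not taken from the repository):

    あ ア
    い イ
    か カ

The CSV is split with a plain splitOn ',', so fields cannot themselves contain commas; each row carries five fields (lexeme, orthography, reading, part of speech, frequency), and main drops a single header row via tail. An invented example row:

    食べる,食べる,タベル,verb,1234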
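
The reading field is messier than a single kana string, which is what normalise untangles: normReading first duplicates the reading with an optional parenthesised span dropped and kept (expandParens), then splits alternatives on '/' (splitSlash), and finally maps each kana to its ID (unKana), silently discarding any variant that is "0", "#N/A", or contains a character missing from the table. With the first two helpers lifted to the top level, one would see, for invented readings:

    expandParens "ハ(ッ)パ"   == ["ハパ", "ハッパ"]
    splitSlash "タベル/クウ"  == ["タベル", "クウ"]

so a single CSV row can contribute several distinct readings to the trie.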
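
As a concrete instance of the serialised format (worked out by hand from serialiseTrie; the word, frequency and kana ID are made up), the trie Node [] [(3, Node [("あ", 10)] [])], i.e. the single word "あ" with frequency 10 stored under the one-ID reading [3], becomes these 17 bytes:

    offset  bytes            meaning
    0       00               root: NL = 0 lexemes
    1       01               root: NS = 1 edge
    2       03 07 00 00 00   edge: kana ID 3, child Node at absolute offset 7
    7       01               child: NL = 1 lexeme
    8       03 E3 81 82      LEN = 3, "あ" in UTF-8
    12      0A 00 00 00      frequency 10 as word32LE
    16      00               child: NS = 0 edges

The child lands at offset 7 because serialiseSubs places node bodies after the complete edge table: restoffset = subsoffset + 1 + 1 * 5 = 7, with subsoffset = 1 since the root stores no lexemes.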
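
The format comment pins down a reader completely. Below is a minimal decoding sketch, not part of the commit: the names byteAt, word32LEAt, readLexemes and lookupReading are invented here, and truncated files or dangling offsets are not handled.

-- Sketch of a reader for the trie format above (helper names invented).
import qualified Data.ByteString as BS
import Data.Bits (shiftL)

-- Byte at an absolute offset, widened to Int.
byteAt :: BS.ByteString -> Int -> Int
byteAt bs i = fromIntegral (BS.index bs i)

-- Little-endian word32 at an absolute offset.
word32LEAt :: BS.ByteString -> Int -> Int
word32LEAt bs i = sum [byteAt bs (i + k) `shiftL` (8 * k) | k <- [0 .. 3]]

-- Parse the lexeme list of a Node, returning the lexemes together with
-- the offset just past them (where the NS byte of the same Node sits).
readLexemes :: BS.ByteString -> Int -> ([(BS.ByteString, Int)], Int)
readLexemes bs off0 = loop (byteAt bs off0) (off0 + 1) []
  where
    loop 0 off acc = (reverse acc, off)
    loop n off acc =
      let len  = byteAt bs off
          word = BS.take len (BS.drop (off + 1) bs)
          freq = word32LEAt bs (off + 1 + len)
      in loop (n - 1) (off + 1 + len + 4) ((word, freq) : acc)

-- Follow a reading (a list of kana IDs) from the root Node at offset 0;
-- return the words stored at the Node where it ends, if it is reachable.
lookupReading :: BS.ByteString -> [Int] -> Maybe [(BS.ByteString, Int)]
lookupReading bs = go 0
  where
    go off [] = Just (fst (readLexemes bs off))
    go off (k : ks) =
      let (_, nsoff) = readLexemes bs off  -- skip the lexeme list
          ns = byteAt bs nsoff
          edges = [(byteAt bs (nsoff + 1 + 5 * j), word32LEAt bs (nsoff + 2 + 5 * j))
                  | j <- [0 .. ns - 1]]    -- (kana ID, child offset) pairs
      in lookup k edges >>= \child -> go child ks

Loaded into ghci and applied to the 17-byte example above, lookupReading bytes [3] would yield Just [("\227\129\130", 10)], the UTF-8 bytes of "あ" paired with its frequency; any reading that walks off the trie returns Nothing.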