#!/usr/bin/env cabal
{- cabal:
build-depends: base, containers, bytestring, bytestring-strict-builder
-}

{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE StrictData #-}
{-# LANGUAGE ViewPatterns #-}
{-# OPTIONS -Wall #-}
-- | Convert the reading/frequency CSV database into a binary trie, indexed
-- by kana-ID reading, in the format described above 'serialiseTrie'.
module Main where

import qualified ByteString.StrictBuilder as BSB
import Data.Bits (shiftR)
import qualified Data.ByteString as BS
import Data.Foldable (toList)
import Data.Int (Int32)
import Data.List (sort)
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as Map
import Data.Word (Word32)
import System.Environment (getArgs)
import System.Exit (die)


-- | Split a list on every occurrence of a separator; the separators
-- themselves are dropped.
splitOn :: Eq a => a -> [a] -> [[a]]
splitOn sep l = case break (== sep) l of
                  (pre, _ : post) -> pre : splitOn sep post
                  (pre, []) -> [pre]

-- | Split a list into consecutive blocks of n elements; the final block may
-- be shorter.
blockBy :: Int -> [a] -> [[a]]
blockBy n l = case splitAt n l of
                (pre, []) -> [pre]
                (pre, post) -> pre : blockBy n post

-- | Whether the list contains two equal elements; sorts first, so O(n log n).
hasDuplicates :: Ord a => [a] -> Bool
hasDuplicates (sort -> l) = any (uncurry (==)) (zip l (drop 1 l))

-- | One database row. The reading is a type parameter so that a row can
-- first hold raw kana ('Row Char') and later normalised kana IDs ('Row Int').
data Row i = Row
  { rLexeme :: String   -- ^ The word itself
  , rOrtho :: String    -- ^ Orthographic form
  , rReading :: [i]     -- ^ Reading: kana, later kana IDs
  , rPOS :: String      -- ^ Part of speech
  , rFreq :: Int }      -- ^ Frequency count
  deriving (Show)

-- | Parse the CSV database: one row per line, exactly five comma-separated
-- fields. Deliberately naive: fields must not themselves contain commas.
readCSV :: String -> [Row Char]
readCSV = map (toRow . splitOn ',') . lines
  where
    toRow :: [String] -> Row Char
    toRow [lx, ort, rea, pos, fre] = Row lx ort rea pos (read fre)
    toRow row = error $ "Invalid row: " ++ show row

-- (hira->kata, kata->ID, num IDs)
readTable :: String -> (Map Char Char, Map Char Int, Int)
readTable input =
  let list = [(h, k) | [[h], [k]] <- map words (lines input)]
  in (Map.fromList list
     ,Map.fromList (zip (map snd list) [0..])
     ,length list)

-- | Expand one raw row into zero or more rows with numeric readings,
-- working in the list monad. A parenthesised part of a reading is optional,
-- '/' separates alternative readings, "0" and "#N/A" mean the reading is
-- absent, and a reading containing any character that is not in the table
-- is dropped entirely.
normalise :: Map Char Int -> Row Char -> [Row Int]
normalise any2id row = do
  rea <- normReading (rReading row)
  return row { rReading = rea }
  where
    normReading r = expandParens r >>= splitSlash >>= unKana
    -- Only the first parenthesised group is expanded.
    expandParens r
      | (pre, '(' : r1) <- break (== '(') r
      , (inner, ')' : post) <- break (== ')') r1
      = [pre ++ post, pre ++ inner ++ post]
      | otherwise = [r]
    splitSlash = splitOn '/'
    unKana :: String -> [[Int]]
    unKana "0" = []
    unKana "#N/A" = []
    -- 'toList' turns Nothing into [] and Just ids into [ids].
    unKana r = toList (traverse unKanaChar r)
    unKanaChar :: Char -> Maybe Int
    unKanaChar c = Map.lookup c any2id
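
-- A worked example with hypothetical IDs (the real ones come from
-- table.txt): given any2id = Map.fromList [('ア', 0), ('イ', 1), ('ウ', 2)],
-- the reading "ア(イ)/ウ" expands to ["ア/ウ", "アイ/ウ"], splits on '/'
-- into ["ア", "ウ", "アイ", "ウ"], and normalises to [[0], [2], [0, 1], [2]].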

data Trie = Node [(String, Int)]  -- ^ Words with this reading
                 [(Int, Trie)]    -- ^ Sub-tries
  deriving (Show)

-- | Build a trie from rows that all share the same first 'prefixlen'
-- reading IDs: rows whose reading is exactly the prefix become lexemes of
-- this node; longer rows are grouped by their next reading ID.
makeTrie :: Int -> [Row Int] -> Trie
makeTrie prefixlen rows =
  let here = [row | row <- rows, length (rReading row) == prefixlen]
      longer = Map.fromListWith (++)
                 [(rReading row !! prefixlen, [row])
                 | row <- rows, length (rReading row) > prefixlen]
  in Node [(rLexeme r, rFreq r) | r <- here]
          (Map.assocs (makeTrie (prefixlen + 1) <$> longer))
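
-- A minimal sketch of how the in-memory trie would be consulted: follow one
-- kana ID per level and read off the lexemes at the final node.
-- 'lookupReading' is a hypothetical helper for illustration; this converter
-- never queries the trie itself.
lookupReading :: [Int] -> Trie -> [(String, Int)]
lookupReading [] (Node lexemes _) = lexemes
lookupReading (c : cs) (Node _ subs) =
  maybe [] (lookupReading cs) (lookup c subs)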

-- Serialised format of the trie:
-- - Node =
--   - word8 NL
--   - NL * Lexeme: lexemes with the current prefix as reading
--   - word8 NS
--   - NS * Edge: longer words
-- - Lexeme =
--   - word8 LEN
--   - LEN * word8: the lexeme in UTF-8
--   - word32LE: word frequency
-- - Edge =
--   - word8: the kana ID on this edge of the trie (add this to the end of the running prefix)
--   - word32LE: absolute byte offset pointing to another Node
--
-- The root Node for the empty prefix is simply at the start of the file (byte offset 0).
--
-- (A sketch of a reader for this format, 'readNode', follows 'serialiseTrie' below.)

-- | Serialise one trie node placed at absolute byte offset 'topoffset';
-- the bodies of its sub-tries are laid out after its own edge list.
serialiseTrie :: Int -> Int -> Trie -> BSB.Builder
serialiseTrie nIDs topoffset (Node lexemes subs)
  | nIDs > 255 = error "Alphabet cannot be >255 in size with current database format"
  | length lexemes > 255 = error $ "Too many lexemes for single reading: " ++ show lexemes
  | otherwise =
      let bwords = foldMap serialiseWord lexemes
      in BSB.word8 (fromIntegral (length lexemes))
         <> bwords
         <> serialiseSubs (topoffset + 1 + BSB.builderLength bwords)
  where
    serialiseWord (word, freq) =
      let bword = foldMap BSB.utf8Char word
          len = BSB.builderLength bword
      in if | len > 255 -> error $ "Word too long: " ++ word
            | freq < 0 || freq > fromIntegral (maxBound :: Int32) -> error $ "Frequency too large: " ++ show freq
            | otherwise -> BSB.word8 (fromIntegral len) <> bword <> word32LE (fromIntegral freq)

    serialiseSubs subsoffset
      | any (\c -> c < 0 || c >= nIDs) (map fst subs) || hasDuplicates (map fst subs) =
          error $ "Invalid sub keys: " ++ show (map fst subs)
      | otherwise = BSB.word8 (fromIntegral (length subs)) <> go subs mempty
      where
        -- Sub-trie bodies start after the count byte and the 5-byte edges.
        restoffset = subsoffset + 1 + length subs * 5

        go [] restBuilder = restBuilder
        go ((key, trie) : subs') restBuilder =
          let boffset = restoffset + BSB.builderLength restBuilder
              b = serialiseTrie nIDs boffset trie
          in BSB.word8 (fromIntegral key) <> word32LE (fromIntegral boffset) <> go subs' (restBuilder <> b)

    word32LE :: Word32 -> BSB.Builder
    word32LE w = mconcat [BSB.word8 (fromIntegral (w `shiftR` (8 * i))) | i <- [0 .. 3]]
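
-- A minimal sketch of the corresponding reader, following the format
-- documented above 'serialiseTrie'. It is not used by this program and is
-- included only to illustrate the layout; lexemes are returned as raw
-- UTF-8 'BS.ByteString's to avoid extra dependencies.
readNode :: BS.ByteString -> Int -> ([(BS.ByteString, Word32)], [(Int, Int)])
readNode bs off0 =
  let nl = fromIntegral (BS.index bs off0)        -- word8 NL
      (lexemes, off1) = goLex nl (off0 + 1)       -- NL * Lexeme
      ns = fromIntegral (BS.index bs off1) :: Int -- word8 NS
      edges = [ ( fromIntegral (BS.index bs (off1 + 1 + 5 * i))     -- kana ID
                , fromIntegral (readWord32LE (off1 + 2 + 5 * i)) )  -- node offset
              | i <- [0 .. ns - 1] ]              -- NS * Edge
  in (lexemes, edges)
  where
    goLex :: Int -> Int -> ([(BS.ByteString, Word32)], Int)
    goLex 0 off = ([], off)
    goLex n off =
      let len = fromIntegral (BS.index bs off)        -- word8 LEN
          word = BS.take len (BS.drop (off + 1) bs)   -- LEN * word8
          freq = readWord32LE (off + 1 + len)         -- word32LE frequency
          (rest, off') = goLex (n - 1) (off + 1 + len + 4)
      in ((word, freq) : rest, off')
    readWord32LE :: Int -> Word32
    readWord32LE off =
      foldr (\b acc -> acc * 256 + fromIntegral b) 0
            [BS.index bs (off + i) | i <- [0 .. 3]]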

main :: IO ()
main = do
  (csvfname, outfname) <- getArgs >>= \case
    [csvfname, outfname] -> return (csvfname, outfname)
    _ -> die "Usage: process-database <database.csv> <output.bin>"

  (hira2kata, kata2id, nIDs) <- readTable <$> readFile "table.txt"
  -- Map both hiragana and katakana to the same kana IDs.
  let any2id = Map.map (kata2id Map.!) hira2kata <> kata2id

  rows <- concatMap (normalise any2id) . tail . readCSV <$> readFile csvfname

  let trie = makeTrie 0 rows
  -- print trie
  BS.writeFile outfname (BSB.builderBytes (serialiseTrie nIDs 0 trie))
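
-- Example invocation (filenames are hypothetical):
--
--   cabal run process-database.hs -- database.csv database.bin
--
-- table.txt must be present in the working directory.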