1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
|
#!/usr/bin/env cabal
{- cabal:
build-depends: base, containers, bytestring, bytestring-strict-builder
-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE StrictData #-}
{-# LANGUAGE ViewPatterns #-}
{-# OPTIONS -Wall #-}
module Main where
import qualified ByteString.StrictBuilder as BSB
import Data.Bits (shiftR)
import qualified Data.ByteString as BS
import Data.Foldable (toList)
import Data.Int (Int32)
import Data.List (sort)
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as Map
import Data.Word (Word32)
import System.Environment (getArgs)
import System.Exit (die)
splitOn :: Eq a => a -> [a] -> [[a]]
splitOn sep l = case break (== sep) l of
(pre, _ : post) -> pre : splitOn sep post
(pre, []) -> [pre]
blockBy :: Int -> [a] -> [[a]]
blockBy n l = case splitAt n l of
(pre, []) -> [pre]
(pre, post) -> pre : blockBy n post
hasDuplicates :: Ord a => [a] -> Bool
hasDuplicates (sort -> l) = any (uncurry (==)) (zip l (drop 1 l))
data Row i = Row
{ rLexeme :: String
, rOrtho :: String
, rReading :: [i]
, rPOS :: String
, rFreq :: Int }
deriving (Show)
readCSV :: String -> [Row Char]
readCSV = map (toRow . splitOn ',') . lines
where
toRow :: [String] -> Row Char
toRow [lx, ort, rea, pos, fre] = Row lx ort rea pos (read fre)
toRow row = error $ "Invalid row: " ++ show row
-- (hira->kata, kata->ID, num IDs)
readTable :: String -> (Map Char Char, Map Char Int, Int)
readTable input =
let list = [(h, k) | [[h], [k]] <- map words (lines input)]
in (Map.fromList list
,Map.fromList (zip (map snd list) [0..])
,length list)
normalise :: Map Char Int -> Row Char -> [Row Int]
normalise any2id row = do
rea <- normReading (rReading row)
return row { rReading = rea }
where
normReading r = expandParens r >>= splitSlash >>= unKana
expandParens r
| (pre, '(' : r1) <- break (== '(') r
, (inner, ')' : post) <- break (== ')') r1
= [pre ++ post, pre ++ inner ++ post]
| otherwise = [r]
splitSlash = splitOn '/'
unKana :: String -> [[Int]]
unKana "0" = []
unKana "#N/A" = []
unKana r = toList (traverse unKanaChar r)
unKanaChar :: Char -> Maybe Int
unKanaChar c = Map.lookup c any2id
data Trie = Node [(String, Int)] -- ^ Words with this reading
[(Int, Trie)] -- ^ Sub-tries
deriving (Show)
makeTrie :: Int -> [Row Int] -> Trie
makeTrie prefixlen rows =
let here = [row | row <- rows, length (rReading row) == prefixlen]
longer = Map.fromListWith (++)
[(rReading row !! prefixlen, [row])
| row <- rows, length (rReading row) > prefixlen]
in Node [(rLexeme r, rFreq r) | r <- here]
(Map.assocs (makeTrie (prefixlen + 1) <$> longer))
-- Serialised format of the trie:
-- - Node =
-- - word8 NL
-- - NL * Lexeme: lexemes with the current prefix as reading
-- - word8 NS
-- - NS * Edge: longer words
-- - Lexeme =
-- - word8 LEN
-- - LEN * word8: the lexeme in UTF-8
-- - word32LE: word frequency
-- - Edge =
-- - word8: the kana ID on this edge of the trie (add this to the end of the running prefix)
-- - word32LE: absolute byte offset pointing to another Node
--
-- The root Node for the empty prefix is simply at the start of the file (byte offset 0).
serialiseTrie :: Int -> Int -> Trie -> BSB.Builder
serialiseTrie nIDs topoffset (Node lexemes subs)
| nIDs > 255 = error "Alphabet cannot be >255 in size with current database format"
| length lexemes > 255 = error $ "Too many lexemes for single reading: " ++ show lexemes
| otherwise =
let bwords = foldMap serialiseWord lexemes
in BSB.word8 (fromIntegral (length lexemes))
<> bwords
<> serialiseSubs (topoffset + 1 + BSB.builderLength bwords)
where
serialiseWord (word, freq) =
let bword = foldMap BSB.utf8Char word
len = BSB.builderLength bword
in if | len > 255 -> error $ "Word too long: " ++ word
| freq < 0 || freq > fromIntegral (maxBound :: Int32) -> error $ "Frequency too large: " ++ show freq
| otherwise -> BSB.word8 (fromIntegral len) <> bword <> word32LE (fromIntegral freq)
serialiseSubs subsoffset
| any (\c -> c < 0 || c >= nIDs) (map fst subs) || hasDuplicates (map fst subs) =
error $ "Invalid sub keys: " ++ show (map fst subs)
| otherwise = BSB.word8 (fromIntegral (length subs)) <> go subs mempty
where
restoffset = subsoffset + 1 + length subs * 5
go [] restBuilder = restBuilder
go ((key, trie) : subs') restBuilder =
let boffset = restoffset + BSB.builderLength restBuilder
b = serialiseTrie nIDs boffset trie
in BSB.word8 (fromIntegral key) <> word32LE (fromIntegral boffset) <> go subs' (restBuilder <> b)
word32LE :: Word32 -> BSB.Builder
word32LE w = mconcat [BSB.word8 (fromIntegral (w `shiftR` (8 * i))) | i <- [0 .. 3]]
main :: IO ()
main = do
(csvfname, outfname) <- getArgs >>= \case
[csvfname, outfname] -> return (csvfname, outfname)
_ -> die "Usage: process-database <database.csv> <output.bin>"
(hira2kata, kata2id, nIDs) <- readTable <$> readFile "table.txt"
let any2id = Map.map (kata2id Map.!) hira2kata <> kata2id
rows <- concatMap (normalise any2id) . tail . readCSV <$> readFile csvfname
let trie = makeTrie 0 rows
-- print trie
BS.writeFile outfname (BSB.builderBytes (serialiseTrie nIDs 0 trie))
|