aboutsummaryrefslogtreecommitdiff
path: root/process-database.hs
diff options
context:
space:
mode:
author Tom Smeding <tom@tomsmeding.com> 2024-07-07 16:53:56 +0200
committer Tom Smeding <tom@tomsmeding.com> 2024-07-07 16:53:56 +0200
commit b8d7b530070d45482756a27be367bef5c1b735e0 (patch)
tree b9e72ccd0517fdb34c466864d97b08facfb7ce1c /process-database.hs
parent 4b500bd4c69b481a611a61e72795c450120a6a7c (diff)
Kanji completion basically working
Diffstat (limited to 'process-database.hs')
-rwxr-xr-x [-rw-r--r--] process-database.hs 12
1 files changed, 8 insertions, 4 deletions
diff --git a/process-database.hs b/process-database.hs
index 1fa9813..8ce60fd 100644..100755
--- a/process-database.hs
+++ b/process-database.hs
@@ -36,6 +36,10 @@ blockBy n l = case splitAt n l of
hasDuplicates :: Ord a => [a] -> Bool
hasDuplicates (sort -> l) = any (uncurry (==)) (zip l (drop 1 l))
+dedup :: Eq a => [a] -> [a]
+dedup [] = []
+dedup (x:xs) = x : dedup (filter (/= x) xs)
+
data Row i = Row
{ rLexeme :: String
, rOrtho :: String
@@ -54,9 +58,9 @@ readCSV = map (toRow . splitOn ',') . lines
-- (hira->kata, kata->ID, num IDs)
readTable :: String -> (Map Char Char, Map Char Int, Int)
readTable input =
- let list = [(h, k) | [[h], [k]] <- map words (lines input)]
- in (Map.fromList list
- ,Map.fromList (zip (map snd list) [0..])
+ let list = [(h, k) | [h, k] <- map words (lines input)]
+ in (Map.fromList [(h, k) | ([h], [k]) <- list]
+ ,Map.fromList (zip [k | (_, [k]) <- list] [0..])
,length list)
normalise :: Map Char Int -> Row Char -> [Row Int]
@@ -88,7 +92,7 @@ makeTrie prefixlen rows =
longer = Map.fromListWith (++)
[(rReading row !! prefixlen, [row])
| row <- rows, length (rReading row) > prefixlen]
- in Node [(rLexeme r, rFreq r) | r <- here]
+ in Node [(target, rFreq r) | r <- here, target <- dedup [rLexeme r, rOrtho r]]
(Map.assocs (makeTrie (prefixlen + 1) <$> longer))
-- Serialised format of the trie: