diff options
Diffstat (limited to 'process-database.hs')
-rwxr-xr-x[-rw-r--r--] | process-database.hs | 12 |
1 files changed, 8 insertions, 4 deletions
diff --git a/process-database.hs b/process-database.hs index 1fa9813..8ce60fd 100644..100755 --- a/process-database.hs +++ b/process-database.hs @@ -36,6 +36,10 @@ blockBy n l = case splitAt n l of hasDuplicates :: Ord a => [a] -> Bool hasDuplicates (sort -> l) = any (uncurry (==)) (zip l (drop 1 l)) +dedup :: Eq a => [a] -> [a] +dedup [] = [] +dedup (x:xs) = x : dedup (filter (/= x) xs) + data Row i = Row { rLexeme :: String , rOrtho :: String @@ -54,9 +58,9 @@ readCSV = map (toRow . splitOn ',') . lines -- (hira->kata, kata->ID, num IDs) readTable :: String -> (Map Char Char, Map Char Int, Int) readTable input = - let list = [(h, k) | [[h], [k]] <- map words (lines input)] - in (Map.fromList list - ,Map.fromList (zip (map snd list) [0..]) + let list = [(h, k) | [h, k] <- map words (lines input)] + in (Map.fromList [(h, k) | ([h], [k]) <- list] + ,Map.fromList (zip [k | (_, [k]) <- list] [0..]) ,length list) normalise :: Map Char Int -> Row Char -> [Row Int] @@ -88,7 +92,7 @@ makeTrie prefixlen rows = longer = Map.fromListWith (++) [(rReading row !! prefixlen, [row]) | row <- rows, length (rReading row) > prefixlen] - in Node [(rLexeme r, rFreq r) | r <- here] + in Node [(target, rFreq r) | r <- here, target <- dedup [rLexeme r, rOrtho r]] (Map.assocs (makeTrie (prefixlen + 1) <$> longer)) -- Serialised format of the trie: |