about | summary | refs | log | tree | commit | diff
path: root/process-database.hs
diff options
context:
space:
mode:
author: Tom Smeding <tom@tomsmeding.com> 2024-07-07 17:50:53 +0200
committer: Tom Smeding <tom@tomsmeding.com> 2024-07-07 17:50:53 +0200
commit: ae280f4dd6f96a2eb01d1d1fc05be5e7c7f8bbf1 (patch)
tree: 200903e2d897f19cba4c37f7c03f0e6811fdd0dc /process-database.hs
parent: 37346eaed90376979255eabc82725ceea77eabb5 (diff)
Filter some nonsensical lexemes from the data
Diffstat (limited to 'process-database.hs')
-rwxr-xr-x process-database.hs 7
1 files changed, 6 insertions, 1 deletions
diff --git a/process-database.hs b/process-database.hs
index 8ce60fd..bfbd4b7 100755
--- a/process-database.hs
+++ b/process-database.hs
@@ -92,7 +92,12 @@ makeTrie prefixlen rows =
longer = Map.fromListWith (++)
[(rReading row !! prefixlen, [row])
| row <- rows, length (rReading row) > prefixlen]
- in Node [(target, rFreq r) | r <- here, target <- dedup [rLexeme r, rOrtho r]]
+ in Node [(target, rFreq r)
+ | r <- here
+ , target <- dedup [rLexeme r, rOrtho r]
+ , not (null target)
+ , target `notElem` ["*", "0", "#N/A"] -- 隘 has "*" as "orthography"
+ , all (`notElem` ['a'..'z']) target] -- "アイ鱻eye" is not a sensible lexeme
(Map.assocs (makeTrie (prefixlen + 1) <$> longer))
-- Serialised format of the trie: