From ae280f4dd6f96a2eb01d1d1fc05be5e7c7f8bbf1 Mon Sep 17 00:00:00 2001 From: Tom Smeding Date: Sun, 7 Jul 2024 17:50:53 +0200 Subject: Filter some nonsensical lexemes from the data --- process-database.hs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/process-database.hs b/process-database.hs index 8ce60fd..bfbd4b7 100755 --- a/process-database.hs +++ b/process-database.hs @@ -92,7 +92,12 @@ makeTrie prefixlen rows = longer = Map.fromListWith (++) [(rReading row !! prefixlen, [row]) | row <- rows, length (rReading row) > prefixlen] - in Node [(target, rFreq r) | r <- here, target <- dedup [rLexeme r, rOrtho r]] + in Node [(target, rFreq r) + | r <- here + , target <- dedup [rLexeme r, rOrtho r] + , not (null target) + , target `notElem` ["*", "0", "#N/A"] -- 隘 has "*" as "orthography" + , all (`notElem` ['a'..'z']) target] -- "アイ鱻eye" is not a sensible lexeme (Map.assocs (makeTrie (prefixlen + 1) <$> longer)) -- Serialised format of the trie: -- cgit v1.2.3-70-g09d2