Kanji completion basically working

author: Tom Smeding <tom@tomsmeding.com> 2024-07-07 16:53:56 +0200
committer: Tom Smeding <tom@tomsmeding.com> 2024-07-07 16:53:56 +0200
commit: b8d7b530070d45482756a27be367bef5c1b735e0 (patch)
tree: b9e72ccd0517fdb34c466864d97b08facfb7ce1c
parent: 4b500bd4c69b481a611a61e72795c450120a6a7c (diff)
3 files changed, 296 insertions, 119 deletions
diff --git a/README.txt b/README.txt
index c93cbe1..b54816a 100644
--- a/README.txt
+++ b/README.txt
@@ -12,5 +12,6 @@ Put the result in a CSV (say "database.csv") with 5 columns. It can be
 ascertained that the data from the spreadsheet does not contain commas in the
 selected columns, so the CSV conversion is safe.
 
-Then
-  $ cabal run process-database.hs -- database.csv
+Then:
+  $ ./process-database.hs database.csv database.bin
+creates the indexed database file that is read by the vim script.
diff --git a/japanese.vim b/japanese.vim
index ccd90a5..b2ed95c 100644
--- a/japanese.vim
+++ b/japanese.vim
@@ -1,79 +1,104 @@
 " Convention: If a variable holds a 1-based coordinate, its name is suffixed
 " with '_1'.
 
+" TODO:
+" - Remove the underline when completing, it's the wrong length anyway
+" - Make starting completion do something sensible when the cursor is not at
+"   the end of the current region
+" - Fix backspace while completing
+
 " ----------------------------- GENERIC HELPERS -------------------------------
 
-if !hlexists("JapaneseVimFragment")
+if !hlexists('JapaneseVimFragment')
   hi JapaneseVimFragment cterm=underline gui=underline
 endif
 
 function s:append_at_cursor(text)
-  const y_1 = line(".")
-  const xbytes = col(".") - 1
-  const ln = getline(".")
-  call setline(".", strpart(ln, 0, xbytes) .. a:text .. strpart(ln, xbytes))
+  const y_1 = line('.')
+  const xbytes = col('.') - 1
+  const ln = getline('.')
+  call setline('.', strpart(ln, 0, xbytes) .. a:text .. strpart(ln, xbytes))
   call cursor(y_1, xbytes + 1 + strlen(a:text))
 endfunction
 
 function s:isvowel(c)
-  return strcharlen(a:c) == 1 && stridx("aiueo", a:c) != -1
+  return strcharlen(a:c) == 1 && stridx('aiueo', a:c) != -1
 endfunction
 
 function s:isconsonant(c)
-  return strcharlen(a:c) == 1 && stridx("kgsztdnhbpmyrw", a:c) != -1
+  return strcharlen(a:c) == 1 && stridx('kgsztdnhbpmyrw', a:c) != -1
 endfunction
 
 
 " ---------------------------------- DATA -------------------------------------
 
-const s:nokana_digraph = ["yi", "ye", "wu"]
+const s:nokana_digraph = ['yi', 'ye', 'wu']
 
 const s:punctuation = {
-      \ ".": "。",
-      \ ",": "、",
-      \ "(": "（",
-      \ ")": "）",
-      \ "{": "｛",
-      \ "}": "｝",
-      \ "[": "「",
-      \ "]": "」",
-      \ "/": "・",
-      \ "~": "〜",
-      \ ":": "：",
-      \ "!": "！",
-      \ "?": "？",
+      \ '.': '。',
+      \ ',': '、',
+      \ '(': '（',
+      \ ')': '）',
+      \ '{': '｛',
+      \ '}': '｝',
+      \ '[': '「',
+      \ ']': '」',
+      \ '/': '・',
+      \ '~': '〜',
+      \ ':': '：',
+      \ '!': '！',
+      \ '?': '？',
       \ }
 
 let s:hira2kata = {}
 let s:kata2hira = {}
-let s:hira2id = {}
-let s:id2hira = []
+let s:kata2id = {}
+let s:id2kata = []
 function s:filltables()
   let i = 0
-  for ln in readfile("table.txt")
-    let words = split(ln, " ")
+  for ln in readfile('table.txt')
+    let words = split(ln, ' ')
     let s:hira2kata[words[0]] = words[1]
     let s:kata2hira[words[1]] = words[0]
-    let s:hira2id[words[0]] = i
-    call add(s:id2hira, words[0])
+    let s:kata2id[words[1]] = i
+    call add(s:id2kata, words[1])
     let i += 1
   endfor
 endfunction
 call s:filltables()
 lockvar! s:hira2kata
 lockvar! s:kata2hira
-lockvar! s:hira2id
-lockvar! s:id2hira
+lockvar! s:kata2id
+lockvar! s:id2kata
 
 try
-  const s:readingdb = readblob("database.bin")
+  const s:readingdb = readblob('database.bin')
 catch
-  echoe "Reading database not found, make sure you ran process-database.hs"
+  echoe 'Reading database not found, make sure you ran process-database.hs'
 endtry
 
+function s:iskana(c)
+  return has_key(s:kata2hira, a:c) || has_key(s:hira2kata, a:c)  " key of either kana table
+endfunction
+
+
+" ----------------------------- GLOBAL VARIABLES ------------------------------
+
+" The fragment currently being typed and formatted.
+" Dictionary as returned by s:make_region.
+let s:cur_region = v:null
+
+" Set when starting completion, undefined at other times (may not be reset to
+" v:null).
+let s:completion_orig_text = v:null
+
 
 " ------------------------ READING DATABASE FUNCTIONS -------------------------
 
+" This bound is not adhered to _precisely_, but collection stops soon after
+" hitting this limit.
+const s:rd_max_num_results = 1000
+
 function s:rd_read32le(offset)
   let n = s:readingdb[a:offset]
   let n += 0x100 * s:readingdb[a:offset+1]
@@ -82,40 +107,135 @@ function s:rd_read32le(offset)
   return n
 endfunction
 
-" Adds words to the a:accum list. The list is a list of pairs
-" [String, Number]: a word with its associated frequency.
-" Only words that share a maximal-length prefix with idlist will be added.
+" Adds words to the a:accum list. a:accum is a list of hits; a hit is a list:
+" [String   <- the dictionary lexeme
+" ,Number   <- frequency recorded in the dictionary
+" ,Number   <- number of kana in the reading of the lexeme
+" ,Number]  <- length of shared prefix between reading and given idlist
+" Only lexemes that share a maximal-length prefix with idlist are added.
 " If a:idlist = v:null, everything will be added.
-function s:rd_find_idlist_rec(idlist, prefixlen, offset, accum)
+function s:rd_find_idlist_rec(idlist, prefixlen, sharedprefixlen, offset, accum)
+  const should_accum = a:idlist is v:null || a:prefixlen >= len(a:idlist)
+
   const nlexemes = s:readingdb[a:offset]
 
-  let off = offset + 1
+  let off = a:offset + 1
   for i in range(nlexemes)
     let wordbytes = s:readingdb[off]
-    if a:idlist is v:null
+    if should_accum
       " A blob is converted to a string by interpreting it as UTF-8 when
       " marshalling it to Lua. Hence we have a convoluted UTF-8 parser right
       " here. Why does vim not have this built-in in a more sensible way?
-      let word = luaeval("_A", s:readingdb[off + 1 : off + wordbytes])
+      let word = luaeval('_A', s:readingdb[off + 1 : off + wordbytes])
       let freq = s:rd_read32le(off + 1 + wordbytes)
-      call add(a:accum, [word, freq])
+      call add(a:accum, [word, freq, a:prefixlen, a:sharedprefixlen])
     endif
     let off += 1 + wordbytes + 4
   endfor
 
-  " TODO: what exactly should we do here? Under what conditions should we
-  " recurse in what fashion?
+  " Prevent the number of results from getting out of hand
+  if len(a:accum) >= s:rd_max_num_results
+    return
+  endif
+
+  const nedges = s:readingdb[off]
+
+  let found_match = 0
+  let off += 1
+  for i in range(nedges)
+    let key = s:readingdb[off]
+    " if a:prefixlen == 1
+    "   echom "prefixlen=1 key=" .. string(key) .. " (nedges=" .. string(nedges) .. ")"
+    " endif
+    " If !should_accum then necessarily a:prefixlen < len(a:idlist)
+    if should_accum || key == a:idlist[a:prefixlen]
+      let found_match = 1
+      " if !should_accum
+      "   echom "found match for key=" .. string(key) .. " at prefixlen=" .. string(a:prefixlen)
+      " endif
+      call s:rd_find_idlist_rec(
+            \ a:idlist, a:prefixlen + 1, a:sharedprefixlen + !should_accum,
+            \ s:rd_read32le(off + 1), a:accum)
+    endif
+    let off += 5
+  endfor
 
-  " if a:idlist is v:null || len(a:idlist) > a:prefixlen
-  "   const nedges = s:readingdb[off]
+  if !found_match && !should_accum
+    " None of the continuations match, and yet we didn't decide yet that we
+    " should start accumulating, then this is the maximal shared prefix
+    " between the input idlist and the database. Hence we should start
+    " accumulating, which we do by simply re-visiting this node in
+    " accumulation mode.
+    call s:rd_find_idlist_rec(v:null, a:prefixlen, a:sharedprefixlen, a:offset, a:accum)
+  endif
+endfunction
+
+function s:rd_kana_to_id(kana)
+  " Map a kana (hiragana is first folded to katakana) to its numeric ID.
+  " Returns -1 when unknown; get() without a default would give 0, a valid ID.
+  return get(s:kata2id, get(s:hira2kata, a:kana, a:kana), -1)
+endfunction
+
+" Returns list of hits; see s:rd_find_idlist_rec for what a hit is.
+function s:rd_find_kana(kanastr)
+  " Translate every character of the input to its ID, then search from offset 0.
+  let ids = []
+  for kana in a:kanastr
+    let ids += [s:rd_kana_to_id(kana)]
+  endfor
+  let hits = []
+  call s:rd_find_idlist_rec(ids, 0, 0, 0, hits)
+  return hits
+endfunction
+
+function s:rd_score_hit(hit)
+  return pow(0.85, a:hit[2] - a:hit[3]) * a:hit[1]  " frequency, decayed per unmatched reading kana
+endfunction
+
+" Sorts in-place.
+function s:rd_sort_hits(hits)
+  " Comparing (right - left) yields a descending sort. Truncation by
+  " float2nr() is acceptable: scores less than 1 apart count as equal.
+  const Cmp = {lhs, rhs -> float2nr(s:rd_score_hit(rhs) - s:rd_score_hit(lhs))}
+  call sort(a:hits, Cmp)
+endfunction
+
+" Returns list of |complete-items| that can replace the kanastr.
+function s:rd_complete_kana(kanastr)
+  const kanacharlen = strcharlen(a:kanastr)
+
+  let hits = s:rd_find_kana(a:kanastr)
+  call s:rd_sort_hits(hits)
+  " echom "hits:" hits
+
+  let result = []
+  for hit in hits
+    " If the shared prefix length is less than the number of input kana, we
+    " have to replace the differing part of the hit with the remaining kana.
+    " We can only do this if the differing part is all-kana.
+    if hit[3] < kanacharlen
+      let prefixlen = strcharlen(hit[0]) - (hit[2] - hit[3])
+      let tail = strcharpart(hit[0], prefixlen)
+      let discard = 0
+      for c in tail
+        if !s:iskana(c)
+          let discard = 1
+          break
+        endif
+      endfor
+      if discard
+        continue
+      endif
+      let complword = strcharpart(hit[0], 0, prefixlen) .. strcharpart(a:kanastr, hit[3])
+    else
+      " The entire input was matched, so we can complete the whole hit.
+      let complword = hit[0]
+    endif
 
-  "   let off += 1
-  "   for i in range(nedges)
-  "     let key = s:readingdb[off]
+    call add(result, #{word: complword, equal: 1})
+  endfor
 
-  "     let off += 5
-  "   endfor
-  " endif
+  return result
 endfunction
 
 
@@ -123,7 +243,7 @@ endfunction
 
 function s:renderkana(s, kanamode)
   if len(a:s) == 0
-    return ""
+    return ''
   endif
 
   if a:kanamode == 1
@@ -131,57 +251,69 @@ function s:renderkana(s, kanamode)
   elseif a:kanamode == 2
     const hira = 0
   else
-    throw "Invalid kanamode " . a:kanamode
+    throw 'Invalid kanamode ' . a:kanamode
   endif
 
-  " echom "renderkana:" a:s strcharpart(a:s, 0, 1) s:isvowel(strcharpart(a:s, 0, 1))
+  " echom 'renderkana:' a:s strcharpart(a:s, 0, 1) s:isvowel(strcharpart(a:s, 0, 1))
 
-  let result = ""
+  let result = ''
   let i = 0
   while i < strcharlen(a:s)
     let c0 = strcharpart(a:s, i, 1)
     let c1 = strcharpart(a:s, i+1, 1)
     let c2 = strcharpart(a:s, i+2, 1)
 
-    " echom "render:" a:s i c0
+    " echom 'render:' a:s i c0
 
     " vowel kana
     if s:isvowel(c0)
-      let result ..= hira ? digraph_get(c0 .. "5") : digraph_get(toupper(c0) .. "6")
+      let result ..= hira ? digraph_get(c0 .. '5') : digraph_get(toupper(c0) .. '6')
       let i += 1
     " consonant-vowel kana
     elseif s:isconsonant(c0) && s:isvowel(c1) && index(s:nokana_digraph, c0 .. c1) == -1
       let result ..= hira ? digraph_get(c0 .. c1) : digraph_get(toupper(c0) .. c1)
       let i += 2
-    " v-dakuten kana
-    elseif c0 ==# "v" && s:isvowel(c1)
-      let kata = digraph_get("V" .. c1)
-      let result ..= hira ? s:kata2hira[kata] : kata
-      let i += 2
     " glide
-    elseif s:isconsonant(c0) && c1 ==# "y" && s:isvowel(c2)
-      let result ..= hira ? digraph_get(c0 .. "i") : digraph_get(toupper(c0) .. "i")
-      let result ..= hira ? digraph_get("y" .. toupper(c2)) : digraph_get("Y" .. toupper(c2))
+    elseif s:isconsonant(c0) && c1 ==# 'y' && s:isvowel(c2)
+      let result ..= hira ? digraph_get(c0 .. 'i') : digraph_get(toupper(c0) .. 'i')
+      let result ..= hira ? digraph_get('y' .. toupper(c2)) : digraph_get('Y' .. toupper(c2))
+      let i += 3
+    " t-h-i shorthand
+    elseif (c0 .. c1 .. c2) == "thi"
+      let result ..= hira ? 'てぃ' : 'ティ'
+      let i += 3
+    " d-h-i shorthand
+    elseif (c0 .. c1 .. c2) == "dhi"
+      let result ..= hira ? 'でぃ' : 'ディ'
       let i += 3
     " normal n
-    elseif c0 ==# "n" && c1 ==# "n"
-      let result ..= (hira ? "ん" : "ン")
+    elseif c0 ==# 'n' && c1 ==# 'n'
+      let result ..= (hira ? 'ん' : 'ン')
       let i += 2
     " implicit n
-    elseif c0 ==# "n" && c1 !=# "y" && s:isconsonant(c1)
-      let result ..= (hira ? "ん" : "ン")
+    elseif c0 ==# 'n' && c1 !=# 'y' && s:isconsonant(c1)
+      let result ..= (hira ? 'ん' : 'ン')
       let i += 1
     " geminated consonant
     elseif s:isconsonant(c0) && c0 ==# c1
-      let result ..= (hira ? "っ" : "ッ")
+      let result ..= (hira ? 'っ' : 'ッ')
       let i += 1
+    " v-kana shorthand
+    elseif c0 ==# 'v' && s:isvowel(c1)
+      let result ..= hira ? 'ゔ' .. digraph_get(toupper(c1) .. '5')
+                        \ : 'ヴ' .. digraph_get(c1 .. '6')
+      let i += 2
     " small vowel
-    elseif (c0 ==# "x" || c0 ==# "l") && s:isvowel(c1)
-      let result ..= hira ? digraph_get(toupper(c1) .. "5") : digraph_get(c1 .. "6")
+    elseif (c0 ==# 'x' || c0 ==# 'l') && s:isvowel(c1)
+      let result ..= hira ? digraph_get(toupper(c1) .. '5') : digraph_get(c1 .. '6')
       let i += 2
+    " small y-kana
+    elseif (c0 ==# 'x' || c0 ==# 'l') && c1 ==# 'y' && index(['a','u','o'], c2) != -1
+      let result ..= hira ? digraph_get('y' .. toupper(c2)) : digraph_get('Y' .. toupper(c2))
+      let i += 3
     " sokuon
-    elseif c0 == "-"
-      let result ..= "ー"
+    elseif c0 == '-'
+      let result ..= 'ー'
       let i += 1
     " puctuation
     elseif has_key(s:punctuation, c0)
@@ -201,7 +333,7 @@ function s:make_region_match(region)
   const ln = getline(a:region.y_1)
   const prefix = strcharpart(ln, 0, a:region.x1_1 - 1)
   const infix = strcharpart(ln, a:region.x1_1 - 1, a:region.x2_1 - a:region.x1_1)
-  return matchaddpos("JapaneseVimFragment", [[a:region.y_1, strlen(prefix) + 1, strlen(infix)]])
+  return matchaddpos('JapaneseVimFragment', [[a:region.y_1, strlen(prefix) + 1, strlen(infix)]])
 endfunction
 
 " y_1: 1-based line number of the region
@@ -221,10 +353,6 @@ function s:make_region(y_1, x1_1, x2_1, kanamode)
   return d
 endfunction
 
-" The fragment currently being typed and formatted.
-" Dictionary as returned by s:make_region.
-let s:cur_region = v:null
-
 function s:terminate_region()
   if s:cur_region isnot v:null
     call matchdelete(s:cur_region.matchid)
@@ -240,14 +368,20 @@ function s:refresh_region_matchid()
   const ln = getline(s:cur_region.y_1)
   const prefix = strcharpart(ln, 0, s:cur_region.x1_1 - 1)
   const infix = strcharpart(ln, s:cur_region.x1_1 - 1, s:cur_region.x2_1 - s:cur_region.x1_1)
-  let s:cur_region.matchid = matchaddpos("JapaneseVimFragment", [[s:cur_region.y_1, strlen(prefix) + 1, strlen(infix)]])
+  let s:cur_region.matchid = matchaddpos('JapaneseVimFragment', [[s:cur_region.y_1, strlen(prefix) + 1, strlen(infix)]])
 endfunction
 
-function s:handle_keypress(key)
-  " call s:insert_at_cursor("key " .. a:key)
-  const y_1 = line(".")
-  const x = charcol(".") - 1
-  const fullline = getline(".")
+function s:handle_keypress_expr(key)
+  " Accept an open completion and end the region before handling the key.
+  const accept = pumvisible() ? "\<C-y>\<Cmd>call " .. expand('<SID>') .. "terminate_region()\<CR>" : ''
+  return accept .. "\<Cmd>call " .. expand('<SID>') .. "handle_keypress_cmd('" .. (a:key == "'" ? "''" : a:key) .. "')\<CR>"
+endfunction
+
+function s:handle_keypress_cmd(key)
+  " call s:insert_at_cursor('key ' .. a:key)
+  const y_1 = line('.')
+  const x = charcol('.') - 1
+  const fullline = getline('.')
 
   " If the user moved away from the region, start a new region here
   if s:cur_region isnot v:null && (y_1 != s:cur_region.y_1 || x + 1 < s:cur_region.x1_1 || x + 1 > s:cur_region.x2_1)
@@ -274,7 +408,7 @@ function s:handle_keypress(key)
 
   " echom y_1 x s:cur_region input result
 
-  call setline(".", strcharpart(fullline, 0, x - input_extra_chars) .. result .. strcharpart(fullline, x))
+  call setline('.', strcharpart(fullline, 0, x - input_extra_chars) .. result .. strcharpart(fullline, x))
   const newx_1 = x + 1 - input_extra_chars + strcharlen(result)
   call setcursorcharpos(y_1, newx_1)
 
@@ -288,14 +422,14 @@ endfunction
 
 function s:convert_region_kana(tokanamode)
   if s:cur_region is v:null
-    const x_1 = charcol(".")
-    let s:cur_region = s:make_region(line("."), x_1, x_1, a:tokanamode)
+    const x_1 = charcol('.')
+    let s:cur_region = s:make_region(line('.'), x_1, x_1, a:tokanamode)
     return
   endif
 
   const ln = getline(s:cur_region.y_1)
   const s = strcharpart(ln, s:cur_region.x1_1 - 1, s:cur_region.x2_1 - s:cur_region.x1_1)
-  let result = ""
+  let result = ''
   for c in s
     " hiragana -> katakana
     if a:tokanamode == 2
@@ -318,43 +452,81 @@ function s:convert_region_kana(tokanamode)
   let s:cur_region.kanamode = a:tokanamode
 endfunction
 
-function s:handle_escape()
-  call s:terminate_region()
-  stopinsert
+function s:handle_escape_expr()
+  " <C-y> accepts the selected completion first (|complete_CTRL-Y|)
+  const accept = pumvisible() ? "\<C-y>" : ''
+  return accept .. "\<Cmd>call " .. expand('<SID>') .. "terminate_region()\<CR>\<Esc>"
+endfunction
 
-function s:handle_space()
-  call s:terminate_region()
-  call s:append_at_cursor("　")
+function s:handle_space_expr()
+  const accept = pumvisible() ? "\<C-y>" : ''
+  return accept .. "\<Cmd>call " .. expand('<SID>') .. "terminate_region()\<CR>　"  " ends with U+3000
+endfunction
 
-function s:handle_enter()
-  if s:cur_region isnot v:null
-    call s:terminate_region()
-  else
-    " Simulate normal enter
-    const y_1 = line(".")
-    const xbytes = col(".") - 1
-    const ln = getline(".")
-    call setline(".", strpart(ln, 0, xbytes))
-    call append(y_1, strpart(ln, xbytes))
-    call cursor(y_1 + 1, 0)
-  endif
+function s:handle_enter_expr()
+  " With a region open the final <CR> only executes the <Cmd>; else it is a plain Enter.
+  const endregion = s:cur_region isnot v:null ? "\<Cmd>call " .. expand('<SID>') .. "terminate_region()" : ''
+  return (pumvisible() ? "\<C-y>" : '') .. endregion .. "\<CR>"
+endfunction
+
+function s:handle_backspace_expr()
+  " With the popup menu open, <C-e> first *discards* the completion (|complete_CTRL-E|)
+  return pumvisible() ? "\<C-e>\<BS>" : "\<BS>"
+endfunction
 
 function s:handle_hiragana()
   call s:convert_region_kana(1)
 endfunction
 
-function s:handle_katagana()
+function s:handle_katakana()
   call s:convert_region_kana(2)
 endfunction
 
-for c in "abcdefghijklmnopqrstuvwxyz-.,(){}[]/~:!?"
-  execute "inoremap <buffer> <silent> " .. c .. " <Cmd>call <SID>handle_keypress(\"" .. c .. "\")<CR>"
+function s:handle_tab_expr()
+  " <Tab> does one of three things, depending on the editing state.
+  if pumvisible()
+    " Popup menu is open: move to the next completion item.
+    return "\<C-n>"
+  elseif s:cur_region is v:null
+    " Not typing a region: fall back to the saved original <Tab> binding.
+    return expand('<SID>') .. "orig\<Tab>"
+  endif
+  " Start completion; the trailing <C-n> immediately selects the first item.
+  return "\<Cmd>call " .. expand('<SID>') .. "handle_tab_completion()\<CR>\<C-n>"
+endfunction
+
+function s:handle_shifttab_expr()
+  " With the popup menu open, step back to the previous completion item.
+  " Otherwise produce no keys at all.
+  "
+  " Forwarding to the original binding is currently disabled:
+  " return expand('<SID>') .. "orig\<S-Tab>"
+  return pumvisible() ? "\<C-p>" : ''
+endfunction
+
+function s:handle_tab_completion()
+  " Opens the completion menu for the text of the current region.
+  const curline = getline(s:cur_region.y_1)
+  const nbefore = s:cur_region.x1_1 - 1
+  const text = strcharpart(curline, nbefore, s:cur_region.x2_1 - s:cur_region.x1_1)
+
+  " Record the pre-completion text of the region.
+  let s:completion_orig_text = text
+
+  " The start column is the byte length of the text before the region, plus 1.
+  call complete(strlen(strcharpart(curline, 0, nbefore)) + 1, s:rd_complete_kana(text))
+endfunction
+
+for c in 'abcdefghijklmnopqrstuvwxyz-.,(){}[]/~:!?'
+  execute 'inoremap <buffer> <expr> ' .. c .. ' <SID>handle_keypress_expr(''' .. c .. ''')'
 endfor
-inoremap <buffer> <silent> <Esc> <Cmd>call <SID>handle_escape()<CR>
-inoremap <buffer> <silent> <Space> <Cmd>call <SID>handle_space()<CR>
-inoremap <buffer> <silent> <CR> <Cmd>call <SID>handle_enter()<CR>
-inoremap <buffer> <silent> <C-h> <Cmd>call <SID>handle_hiragana()<CR>
-inoremap <buffer> <silent> <C-k> <Cmd>call <SID>handle_katagana()<CR>
+inoremap <buffer> <expr> <Esc> <SID>handle_escape_expr()
+inoremap <buffer> <expr> <Space> <SID>handle_space_expr()
+inoremap <buffer> <expr> <CR> <SID>handle_enter_expr()
+inoremap <buffer> <expr> <BS> <SID>handle_backspace_expr()
+execute 'inoremap <buffer> <SID>orig<Tab> ' .. maparg('<Tab>', 'i')
+inoremap <buffer> <expr> <Tab> <SID>handle_tab_expr()
+execute 'inoremap <buffer> <SID>orig<S-Tab> ' .. maparg('<S-Tab>', 'i')
+inoremap <buffer> <expr> <S-Tab> <SID>handle_shifttab_expr()
+inoremap <buffer> <C-h> <Cmd>call <SID>handle_hiragana()<CR>
+inoremap <buffer> <C-k> <Cmd>call <SID>handle_katakana()<CR>
diff --git a/process-database.hs b/process-database.hs
index 1fa9813..8ce60fd 100644..100755
--- a/process-database.hs
+++ b/process-database.hs
@@ -36,6 +36,10 @@ blockBy n l = case splitAt n l of
 hasDuplicates :: Ord a => [a] -> Bool
 hasDuplicates (sort -> l) = any (uncurry (==)) (zip l (drop 1 l))
 
+-- | Drop repeated elements, keeping only the first occurrence of each.
+dedup :: Eq a => [a] -> [a]
+dedup = foldr (\x rest -> x : filter (/= x) rest) []
+
 data Row i = Row
   { rLexeme :: String
   , rOrtho :: String
@@ -54,9 +58,9 @@ readCSV = map (toRow . splitOn ',') . lines
 -- (hira->kata, kata->ID, num IDs)
 readTable :: String -> (Map Char Char, Map Char Int, Int)
 readTable input =
-  let list = [(h, k) | [[h], [k]] <- map words (lines input)]
-  in (Map.fromList list
-     ,Map.fromList (zip (map snd list) [0..])
+  let list = [(h, k) | [h, k] <- map words (lines input)]
+  in (Map.fromList [(h, k) | ([h], [k]) <- list]
+     ,Map.fromList (zip [k | (_, [k]) <- list] [0..])
      ,length list)
 
 normalise :: Map Char Int -> Row Char -> [Row Int]
@@ -88,7 +92,7 @@ makeTrie prefixlen rows =
       longer = Map.fromListWith (++)
                  [(rReading row !! prefixlen, [row])
                  | row <- rows, length (rReading row) > prefixlen]
-  in Node [(rLexeme r, rFreq r) | r <- here]
+  in Node [(target, rFreq r) | r <- here, target <- dedup [rLexeme r, rOrtho r]]
           (Map.assocs (makeTrie (prefixlen + 1) <$> longer))
 
 -- Serialised format of the trie:
author	Tom Smeding <tom@tomsmeding.com>	2024-07-07 16:53:56 +0200
committer	Tom Smeding <tom@tomsmeding.com>	2024-07-07 16:53:56 +0200
commit	b8d7b530070d45482756a27be367bef5c1b735e0 (patch)
tree	b9e72ccd0517fdb34c466864d97b08facfb7ce1c
parent	4b500bd4c69b481a611a61e72795c450120a6a7c (diff)