aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom Smeding <tom@tomsmeding.com>2024-07-07 16:53:56 +0200
committerTom Smeding <tom@tomsmeding.com>2024-07-07 16:53:56 +0200
commitb8d7b530070d45482756a27be367bef5c1b735e0 (patch)
treeb9e72ccd0517fdb34c466864d97b08facfb7ce1c
parent4b500bd4c69b481a611a61e72795c450120a6a7c (diff)
Kanji completion basically working
-rw-r--r--README.txt5
-rw-r--r--japanese.vim398
-rwxr-xr-x[-rw-r--r--]process-database.hs12
3 files changed, 296 insertions, 119 deletions
diff --git a/README.txt b/README.txt
index c93cbe1..b54816a 100644
--- a/README.txt
+++ b/README.txt
@@ -12,5 +12,6 @@ Put the result in a CSV (say "database.csv") with 5 columns. It can be
ascertained that the data from the spreadsheet does not contain commas in the
selected columns, so the CSV conversion is safe.
-Then
- $ cabal run process-database.hs -- database.csv
+Then:
+ $ ./process-database.hs database.csv database.bin
+creates the indexed database file that is read by the vim script.
diff --git a/japanese.vim b/japanese.vim
index ccd90a5..b2ed95c 100644
--- a/japanese.vim
+++ b/japanese.vim
@@ -1,79 +1,104 @@
" Convention: If a variable holds a 1-based coordinate, its name is suffixed
" with '_1'.
+" TODO:
+" - Remove the underline when completing, it's the wrong length anyway
+" - Make starting completion do something sensible when the cursor is not at
+" the end of the current region
+" - Fix backspace while completing
+
" ----------------------------- GENERIC HELPERS -------------------------------
-if !hlexists("JapaneseVimFragment")
+if !hlexists('JapaneseVimFragment')
hi JapaneseVimFragment cterm=underline gui=underline
endif
function s:append_at_cursor(text)
- const y_1 = line(".")
- const xbytes = col(".") - 1
- const ln = getline(".")
- call setline(".", strpart(ln, 0, xbytes) .. a:text .. strpart(ln, xbytes))
+ const y_1 = line('.')
+ const xbytes = col('.') - 1
+ const ln = getline('.')
+ call setline('.', strpart(ln, 0, xbytes) .. a:text .. strpart(ln, xbytes))
call cursor(y_1, xbytes + 1 + strlen(a:text))
endfunction
function s:isvowel(c)
- return strcharlen(a:c) == 1 && stridx("aiueo", a:c) != -1
+ return strcharlen(a:c) == 1 && stridx('aiueo', a:c) != -1
endfunction
function s:isconsonant(c)
- return strcharlen(a:c) == 1 && stridx("kgsztdnhbpmyrw", a:c) != -1
+ return strcharlen(a:c) == 1 && stridx('kgsztdnhbpmyrw', a:c) != -1
endfunction
" ---------------------------------- DATA -------------------------------------
-const s:nokana_digraph = ["yi", "ye", "wu"]
+const s:nokana_digraph = ['yi', 'ye', 'wu']
const s:punctuation = {
- \ ".": "。",
- \ ",": "、",
- \ "(": "(",
- \ ")": ")",
- \ "{": "{",
- \ "}": "}",
- \ "[": "「",
- \ "]": "」",
- \ "/": "・",
- \ "~": "〜",
- \ ":": ":",
- \ "!": "!",
- \ "?": "?",
+ \ '.': '。',
+ \ ',': '、',
+ \ '(': '(',
+ \ ')': ')',
+ \ '{': '{',
+ \ '}': '}',
+ \ '[': '「',
+ \ ']': '」',
+ \ '/': '・',
+ \ '~': '〜',
+ \ ':': ':',
+ \ '!': '!',
+ \ '?': '?',
\ }
let s:hira2kata = {}
let s:kata2hira = {}
-let s:hira2id = {}
-let s:id2hira = []
+let s:kata2id = {}
+let s:id2kata = []
function s:filltables()
let i = 0
- for ln in readfile("table.txt")
- let words = split(ln, " ")
+ for ln in readfile('table.txt')
+ let words = split(ln, ' ')
let s:hira2kata[words[0]] = words[1]
let s:kata2hira[words[1]] = words[0]
- let s:hira2id[words[0]] = i
- call add(s:id2hira, words[0])
+ let s:kata2id[words[1]] = i
+ call add(s:id2kata, words[1])
let i += 1
endfor
endfunction
call s:filltables()
lockvar! s:hira2kata
lockvar! s:kata2hira
-lockvar! s:hira2id
-lockvar! s:id2hira
+lockvar! s:kata2id
+lockvar! s:id2kata
try
- const s:readingdb = readblob("database.bin")
+ const s:readingdb = readblob('database.bin')
catch
- echoe "Reading database not found, make sure you ran process-database.hs"
+ echoe 'Reading database not found, make sure you ran process-database.hs'
endtry
+" Returns 1 when a:c is a key of one of the two kana tables (s:hira2kata or
+" s:kata2hira), and 0 otherwise.
+function s:iskana(c)
+ if has_key(s:hira2kata, a:c)
+ return 1
+ endif
+ return has_key(s:kata2hira, a:c)
+endfunction
+
+
+" ----------------------------- GLOBAL VARIABLES ------------------------------
+
+" The fragment currently being typed and formatted.
+" Dictionary as returned by s:make_region.
+let s:cur_region = v:null
+
+" Set when starting completion, undefined at other times (may not be reset to
+" v:null).
+let s:completion_orig_text = v:null
+
" ------------------------ READING DATABASE FUNCTIONS -------------------------
+" This bound is not adhered to _precisely_, but collection stops soon after
+" hitting this limit.
+const s:rd_max_num_results = 1000
+
function s:rd_read32le(offset)
let n = s:readingdb[a:offset]
let n += 0x100 * s:readingdb[a:offset+1]
@@ -82,40 +107,135 @@ function s:rd_read32le(offset)
return n
endfunction
-" Adds words to the a:accum list. The list is a list of pairs
-" [String, Number]: a word with its associated frequency.
-" Only words that share a maximal-length prefix with idlist will be added.
+" Adds words to the a:accum list. a:accum is a list of hits; a hit is a list:
+" [String <- the dictionary lexeme
+" ,Number <- frequency recorded in the dictionary
+" ,Number <- number of kana in the reading of the lexeme
+" ,Number] <- length of shared prefix between reading and given idlist
+" Only lexemes that share a maximal-length prefix with idlist are added.
" If a:idlist = v:null, everything will be added.
-function s:rd_find_idlist_rec(idlist, prefixlen, offset, accum)
+function s:rd_find_idlist_rec(idlist, prefixlen, sharedprefixlen, offset, accum)
+ const should_accum = a:idlist is v:null || a:prefixlen >= len(a:idlist)
+
const nlexemes = s:readingdb[a:offset]
- let off = offset + 1
+ let off = a:offset + 1
for i in range(nlexemes)
let wordbytes = s:readingdb[off]
- if a:idlist is v:null
+ if should_accum
" A blob is converted to a string by interpreting it as UTF-8 when
" marshalling it to Lua. Hence we have a convoluted UTF-8 parser right
" here. Why does vim not have this built-in in a more sensible way?
- let word = luaeval("_A", s:readingdb[off + 1 : off + wordbytes])
+ let word = luaeval('_A', s:readingdb[off + 1 : off + wordbytes])
let freq = s:rd_read32le(off + 1 + wordbytes)
- call add(a:accum, [word, freq])
+ call add(a:accum, [word, freq, a:prefixlen, a:sharedprefixlen])
endif
let off += 1 + wordbytes + 4
endfor
- " TODO: what exactly should we do here? Under what conditions should we
- " recurse in what fashion?
+ " Prevent the number of results from getting out of hand
+ if len(a:accum) >= s:rd_max_num_results
+ return
+ endif
+
+ const nedges = s:readingdb[off]
+
+ let found_match = 0
+ let off += 1
+ for i in range(nedges)
+ let key = s:readingdb[off]
+ " if a:prefixlen == 1
+ " echom "prefixlen=1 key=" .. string(key) .. " (nedges=" .. string(nedges) .. ")"
+ " endif
+ " If !should_accum then necessarily a:prefixlen < len(a:idlist)
+ if should_accum || key == a:idlist[a:prefixlen]
+ let found_match = 1
+ " if !should_accum
+ " echom "found match for key=" .. string(key) .. " at prefixlen=" .. string(a:prefixlen)
+ " endif
+ call s:rd_find_idlist_rec(
+ \ a:idlist, a:prefixlen + 1, a:sharedprefixlen + !should_accum,
+ \ s:rd_read32le(off + 1), a:accum)
+ endif
+ let off += 5
+ endfor
+
+ if !found_match && !should_accum
+ " None of the continuations match and we have not yet started
+ " accumulating, so this node marks the maximal shared prefix between the
+ " input idlist and the database. Hence we should start accumulating,
+ " which we do by simply re-visiting this node in accumulation mode
+ " (that is, with a:idlist = v:null).
+ call s:rd_find_idlist_rec(v:null, a:prefixlen, a:sharedprefixlen, a:offset, a:accum)
+ endif
+endfunction
- " if a:idlist is v:null || len(a:idlist) > a:prefixlen
- " const nedges = s:readingdb[off]
+" Maps a single kana to its numeric ID. Hiragana is first translated to
+" katakana through s:hira2kata, because s:kata2id is keyed by katakana.
+" Returns -1 for a character that has no ID.
+function s:rd_kana_to_id(kana)
+ const kata = get(s:hira2kata, a:kana, a:kana)
+ " get() yields 0 when no default is given, and 0 is itself a valid ID (the
+ " first table entry). Pass -1 explicitly so that an unknown character can
+ " never compare equal to a key byte read from the database.
+ return get(s:kata2id, kata, -1)
+endfunction
- " let off += 1
- " for i in range(nedges)
- " let key = s:readingdb[off]
+" Looks up a:kanastr in the reading database and returns the list of hits;
+" see s:rd_find_idlist_rec for what a hit is.
+function s:rd_find_kana(kanastr)
+ " Translate the input, character by character, into kana IDs.
+ let ids = []
+ for ch in a:kanastr
+ let ids += [s:rd_kana_to_id(ch)]
+ endfor
+
+ let hits = []
+ call s:rd_find_idlist_rec(ids, 0, 0, 0, hits)
+ return hits
+endfunction
+
+" Scores a hit (see s:rd_find_idlist_rec for the layout of a hit): the
+" recorded frequency, multiplied by 0.85 for every kana of the reading that
+" lies beyond the prefix shared with the input.
+function s:rd_score_hit(hit)
+ const unmatched = a:hit[2] - a:hit[3]
+ return a:hit[1] * pow(0.85, unmatched)
+endfunction
- " let off += 5
- " endfor
- " endif
+" Sorts a:hits in-place, highest score (s:rd_score_hit) first.
+function s:rd_sort_hits(hits)
+ " The comparator is (score of right) - (score of left), which gives a
+ " descending order. Truncating the difference with float2nr() is good
+ " enough: scores that are less than 1 apart are basically equal anyway.
+ const Cmp = {lhs, rhs -> float2nr(s:rd_score_hit(rhs) - s:rd_score_hit(lhs))}
+ call sort(a:hits, Cmp)
+endfunction
+
+" Returns list of |complete-items| that can replace the kanastr.
+" The items are in the order produced by s:rd_sort_hits. Hits whose part
+" beyond the shared prefix contains a non-kana character are left out.
+function s:rd_complete_kana(kanastr)
+ const kanacharlen = strcharlen(a:kanastr)
+
+ let hits = s:rd_find_kana(a:kanastr)
+ call s:rd_sort_hits(hits)
+ " echom "hits:" hits
+
+ let result = []
+ for hit in hits
+ " If the shared prefix length is less than the number of input kana, we
+ " have to replace the differing part of the hit with the remaining kana.
+ " We can only do this if the differing part is all-kana.
+ if hit[3] < kanacharlen
+ " hit[2] - hit[3] is the number of reading kana beyond the shared prefix.
+ " NOTE(review): this assumes those kana correspond one-to-one to the
+ " last characters of the lexeme hit[0] -- confirm.
+ let prefixlen = strcharlen(hit[0]) - (hit[2] - hit[3])
+ let tail = strcharpart(hit[0], prefixlen)
+ let discard = 0
+ for c in tail
+ if !s:iskana(c)
+ let discard = 1
+ break
+ endif
+ endfor
+ if discard
+ continue
+ endif
+ " Keep the start of the lexeme and append the unmatched input kana.
+ let complword = strcharpart(hit[0], 0, prefixlen) .. strcharpart(a:kanastr, hit[3])
+ else
+ " The entire input was matched, so we can complete the whole hit.
+ let complword = hit[0]
+ endif
+
+ " equal: 1 disables filtering of this item against the typed text (see
+ " |complete-items|).
+ call add(result, #{word: complword, equal: 1})
+ endfor
+
+ return result
endfunction
@@ -123,7 +243,7 @@ endfunction
function s:renderkana(s, kanamode)
if len(a:s) == 0
- return ""
+ return ''
endif
if a:kanamode == 1
@@ -131,57 +251,69 @@ function s:renderkana(s, kanamode)
elseif a:kanamode == 2
const hira = 0
else
- throw "Invalid kanamode " . a:kanamode
+ throw 'Invalid kanamode ' . a:kanamode
endif
- " echom "renderkana:" a:s strcharpart(a:s, 0, 1) s:isvowel(strcharpart(a:s, 0, 1))
+ " echom 'renderkana:' a:s strcharpart(a:s, 0, 1) s:isvowel(strcharpart(a:s, 0, 1))
- let result = ""
+ let result = ''
let i = 0
while i < strcharlen(a:s)
let c0 = strcharpart(a:s, i, 1)
let c1 = strcharpart(a:s, i+1, 1)
let c2 = strcharpart(a:s, i+2, 1)
- " echom "render:" a:s i c0
+ " echom 'render:' a:s i c0
" vowel kana
if s:isvowel(c0)
- let result ..= hira ? digraph_get(c0 .. "5") : digraph_get(toupper(c0) .. "6")
+ let result ..= hira ? digraph_get(c0 .. '5') : digraph_get(toupper(c0) .. '6')
let i += 1
" consonant-vowel kana
elseif s:isconsonant(c0) && s:isvowel(c1) && index(s:nokana_digraph, c0 .. c1) == -1
let result ..= hira ? digraph_get(c0 .. c1) : digraph_get(toupper(c0) .. c1)
let i += 2
- " v-dakuten kana
- elseif c0 ==# "v" && s:isvowel(c1)
- let kata = digraph_get("V" .. c1)
- let result ..= hira ? s:kata2hira[kata] : kata
- let i += 2
" glide
- elseif s:isconsonant(c0) && c1 ==# "y" && s:isvowel(c2)
- let result ..= hira ? digraph_get(c0 .. "i") : digraph_get(toupper(c0) .. "i")
- let result ..= hira ? digraph_get("y" .. toupper(c2)) : digraph_get("Y" .. toupper(c2))
+ elseif s:isconsonant(c0) && c1 ==# 'y' && s:isvowel(c2)
+ let result ..= hira ? digraph_get(c0 .. 'i') : digraph_get(toupper(c0) .. 'i')
+ let result ..= hira ? digraph_get('y' .. toupper(c2)) : digraph_get('Y' .. toupper(c2))
+ let i += 3
+ " t-h-i shorthand
+ elseif (c0 .. c1 .. c2) == "thi"
+ let result ..= hira ? 'てぃ' : 'ティ'
+ let i += 3
+ " d-h-i shorthand
+ elseif (c0 .. c1 .. c2) == "dhi"
+ let result ..= hira ? 'でぃ' : 'ディ'
let i += 3
" normal n
- elseif c0 ==# "n" && c1 ==# "n"
- let result ..= (hira ? "ん" : "ン")
+ elseif c0 ==# 'n' && c1 ==# 'n'
+ let result ..= (hira ? 'ん' : 'ン')
let i += 2
" implicit n
- elseif c0 ==# "n" && c1 !=# "y" && s:isconsonant(c1)
- let result ..= (hira ? "ん" : "ン")
+ elseif c0 ==# 'n' && c1 !=# 'y' && s:isconsonant(c1)
+ let result ..= (hira ? 'ん' : 'ン')
let i += 1
" geminated consonant
elseif s:isconsonant(c0) && c0 ==# c1
- let result ..= (hira ? "っ" : "ッ")
+ let result ..= (hira ? 'っ' : 'ッ')
let i += 1
+ " v-kana shorthand
+ elseif c0 ==# 'v' && s:isvowel(c1)
+ let result ..= hira ? 'ゔ' .. digraph_get(toupper(c1) .. '5')
+ \ : 'ヴ' .. digraph_get(c1 .. '6')
+ let i += 2
" small vowel
- elseif (c0 ==# "x" || c0 ==# "l") && s:isvowel(c1)
- let result ..= hira ? digraph_get(toupper(c1) .. "5") : digraph_get(c1 .. "6")
+ elseif (c0 ==# 'x' || c0 ==# 'l') && s:isvowel(c1)
+ let result ..= hira ? digraph_get(toupper(c1) .. '5') : digraph_get(c1 .. '6')
let i += 2
+ " small y-kana
+ elseif (c0 ==# 'x' || c0 ==# 'l') && c1 ==# 'y' && index(['a','u','o'], c2) != -1
+ let result ..= hira ? digraph_get('y' .. toupper(c2)) : digraph_get('Y' .. toupper(c2))
+ let i += 3
" sokuon
- elseif c0 == "-"
- let result ..= "ー"
+ elseif c0 == '-'
+ let result ..= 'ー'
let i += 1
" puctuation
elseif has_key(s:punctuation, c0)
@@ -201,7 +333,7 @@ function s:make_region_match(region)
const ln = getline(a:region.y_1)
const prefix = strcharpart(ln, 0, a:region.x1_1 - 1)
const infix = strcharpart(ln, a:region.x1_1 - 1, a:region.x2_1 - a:region.x1_1)
- return matchaddpos("JapaneseVimFragment", [[a:region.y_1, strlen(prefix) + 1, strlen(infix)]])
+ return matchaddpos('JapaneseVimFragment', [[a:region.y_1, strlen(prefix) + 1, strlen(infix)]])
endfunction
" y_1: 1-based line number of the region
@@ -221,10 +353,6 @@ function s:make_region(y_1, x1_1, x2_1, kanamode)
return d
endfunction
-" The fragment currently being typed and formatted.
-" Dictionary as returned by s:make_region.
-let s:cur_region = v:null
-
function s:terminate_region()
if s:cur_region isnot v:null
call matchdelete(s:cur_region.matchid)
@@ -240,14 +368,20 @@ function s:refresh_region_matchid()
const ln = getline(s:cur_region.y_1)
const prefix = strcharpart(ln, 0, s:cur_region.x1_1 - 1)
const infix = strcharpart(ln, s:cur_region.x1_1 - 1, s:cur_region.x2_1 - s:cur_region.x1_1)
- let s:cur_region.matchid = matchaddpos("JapaneseVimFragment", [[s:cur_region.y_1, strlen(prefix) + 1, strlen(infix)]])
+ let s:cur_region.matchid = matchaddpos('JapaneseVimFragment', [[s:cur_region.y_1, strlen(prefix) + 1, strlen(infix)]])
endfunction
-function s:handle_keypress(key)
- " call s:insert_at_cursor("key " .. a:key)
- const y_1 = line(".")
- const x = charcol(".") - 1
- const fullline = getline(".")
+" <expr> mapping handler for the typed character a:key. Returns the keys to
+" execute: a <Cmd> call of handle_keypress_cmd for the key, preceded -- when
+" the popup menu is visible -- by <C-y> and a <Cmd> call of terminate_region.
+function s:handle_keypress_expr(key)
+ const sid = expand('<SID>')
+ " Double a single quote so that it survives inside the '...' literal below.
+ const quoted = a:key == "'" ? "''" : a:key
+ let keys = ''
+ if pumvisible()
+ let keys ..= "\<C-y>\<Cmd>call " .. sid .. "terminate_region()\<CR>"
+ endif
+ return keys .. "\<Cmd>call " .. sid .. "handle_keypress_cmd('" .. quoted .. "')\<CR>"
+endfunction
+
+function s:handle_keypress_cmd(key)
+ " call s:insert_at_cursor('key ' .. a:key)
+ const y_1 = line('.')
+ const x = charcol('.') - 1
+ const fullline = getline('.')
" If the user moved away from the region, start a new region here
if s:cur_region isnot v:null && (y_1 != s:cur_region.y_1 || x + 1 < s:cur_region.x1_1 || x + 1 > s:cur_region.x2_1)
@@ -274,7 +408,7 @@ function s:handle_keypress(key)
" echom y_1 x s:cur_region input result
- call setline(".", strcharpart(fullline, 0, x - input_extra_chars) .. result .. strcharpart(fullline, x))
+ call setline('.', strcharpart(fullline, 0, x - input_extra_chars) .. result .. strcharpart(fullline, x))
const newx_1 = x + 1 - input_extra_chars + strcharlen(result)
call setcursorcharpos(y_1, newx_1)
@@ -288,14 +422,14 @@ endfunction
function s:convert_region_kana(tokanamode)
if s:cur_region is v:null
- const x_1 = charcol(".")
- let s:cur_region = s:make_region(line("."), x_1, x_1, a:tokanamode)
+ const x_1 = charcol('.')
+ let s:cur_region = s:make_region(line('.'), x_1, x_1, a:tokanamode)
return
endif
const ln = getline(s:cur_region.y_1)
const s = strcharpart(ln, s:cur_region.x1_1 - 1, s:cur_region.x2_1 - s:cur_region.x1_1)
- let result = ""
+ let result = ''
for c in s
" hiragana -> katakana
if a:tokanamode == 2
@@ -318,43 +452,81 @@ function s:convert_region_kana(tokanamode)
let s:cur_region.kanamode = a:tokanamode
endfunction
-function s:handle_escape()
- call s:terminate_region()
- stopinsert
+function s:handle_escape_expr()
+ " <C-y>: accept completion and close completion menu (|complete_CTRL-Y|)
+ return (pumvisible() ? "\<C-y>" : '') ..
+ \ "\<Cmd>call " .. expand('<SID>') .. "terminate_region()\<CR>\<Esc>"
endfunction
-function s:handle_space()
- call s:terminate_region()
- call s:append_at_cursor(" ")
+function s:handle_space_expr()
+ return (pumvisible() ? "\<C-y>" : '') ..
+ \ "\<Cmd>call " .. expand('<SID>') .. "terminate_region()\<CR> "
endfunction
-function s:handle_enter()
- if s:cur_region isnot v:null
- call s:terminate_region()
- else
- " Simulate normal enter
- const y_1 = line(".")
- const xbytes = col(".") - 1
- const ln = getline(".")
- call setline(".", strpart(ln, 0, xbytes))
- call append(y_1, strpart(ln, xbytes))
- call cursor(y_1 + 1, 0)
- endif
+" <expr> mapping handler for <CR>. Accepts a visible completion with <C-y>
+" first. While a region is active the <Cmd> call below deliberately has no
+" "\<CR>" of its own: the trailing "\<CR>" then terminates that command, so
+" <CR> only ends the region and no newline is inserted. Without a region the
+" trailing "\<CR>" is an ordinary <CR>.
+function s:handle_enter_expr()
+ return (pumvisible() ? "\<C-y>" : '') ..
+ \ (s:cur_region isnot v:null ? "\<Cmd>call " .. expand('<SID>') .. "terminate_region()" : '') ..
+ \ "\<CR>"
+endfunction
+
+function s:handle_backspace_expr()
+ " <C-e>: *discard* completion and close completion menu (|complete_CTRL-E|)
+ return (pumvisible() ? "\<C-e>" : '') .. "\<BS>"
endfunction
function s:handle_hiragana()
call s:convert_region_kana(1)
endfunction
-function s:handle_katagana()
+function s:handle_katakana()
call s:convert_region_kana(2)
endfunction
-for c in "abcdefghijklmnopqrstuvwxyz-.,(){}[]/~:!?"
- execute "inoremap <buffer> <silent> " .. c .. " <Cmd>call <SID>handle_keypress(\"" .. c .. "\")<CR>"
+" <expr> mapping handler for <Tab>:
+" - popup menu visible: select the next completion item;
+" - no region being typed: insert a plain <Tab>;
+" - otherwise: start completion for the current region.
+function s:handle_tab_expr()
+ if pumvisible()
+ " next completion item
+ return "\<C-n>"
+ endif
+
+ if s:cur_region is v:null
+ " This function is the {rhs} of an :inoremap <expr> mapping, so the keys
+ " it returns are never remapped. Returning expand('<SID>') .. "orig\<Tab>"
+ " would therefore insert the literal text '<SNR>N_orig' followed by a Tab
+ " instead of triggering the <SID>orig<Tab> mapping. Insert a plain <Tab>.
+ return "\<Tab>"
+ endif
+
+ " Add a <C-n> at the end to immediately select the first completion item
+ return "\<Cmd>call " .. expand('<SID>') .. "handle_tab_completion()\<CR>\<C-n>"
+endfunction
+
+" <expr> mapping handler for <S-Tab>: selects the previous completion item
+" while the popup menu is visible and does nothing otherwise.
+function s:handle_shifttab_expr()
+ " Forwarding to the original binding is currently disabled:
+ " return expand('<SID>') .. "orig\<S-Tab>"
+ return pumvisible() ? "\<C-p>" : ''
+endfunction
+
+" Starts insert-mode completion for the text of the current region. Reads
+" s:cur_region without checking it, so it must not be v:null here. Stores the
+" region text in s:completion_orig_text.
+function s:handle_tab_completion()
+ const line = getline(s:cur_region.y_1)
+ const nbefore = s:cur_region.x1_1 - 1
+ const nregion = s:cur_region.x2_1 - s:cur_region.x1_1
+ " complete() expects a 1-based byte column; the region stores character
+ " columns, so measure the byte length of the text before the region.
+ const startcol_1 = strlen(strcharpart(line, 0, nbefore)) + 1
+
+ let s:completion_orig_text = strcharpart(line, nbefore, nregion)
+
+ call complete(startcol_1, s:rd_complete_kana(s:completion_orig_text))
+endfunction
+
+for c in 'abcdefghijklmnopqrstuvwxyz-.,(){}[]/~:!?'
+ execute 'inoremap <buffer> <expr> ' .. c .. ' <SID>handle_keypress_expr(''' .. c .. ''')'
endfor
-inoremap <buffer> <silent> <Esc> <Cmd>call <SID>handle_escape()<CR>
-inoremap <buffer> <silent> <Space> <Cmd>call <SID>handle_space()<CR>
-inoremap <buffer> <silent> <CR> <Cmd>call <SID>handle_enter()<CR>
-inoremap <buffer> <silent> <C-h> <Cmd>call <SID>handle_hiragana()<CR>
-inoremap <buffer> <silent> <C-k> <Cmd>call <SID>handle_katagana()<CR>
+inoremap <buffer> <expr> <Esc> <SID>handle_escape_expr()
+inoremap <buffer> <expr> <Space> <SID>handle_space_expr()
+inoremap <buffer> <expr> <CR> <SID>handle_enter_expr()
+inoremap <buffer> <expr> <BS> <SID>handle_backspace_expr()
+execute 'inoremap <buffer> <SID>orig<Tab> ' .. maparg('<Tab>', 'i')
+inoremap <buffer> <expr> <Tab> <SID>handle_tab_expr()
+execute 'inoremap <buffer> <SID>orig<S-Tab> ' .. maparg('<S-Tab>', 'i')
+inoremap <buffer> <expr> <S-Tab> <SID>handle_shifttab_expr()
+inoremap <buffer> <C-h> <Cmd>call <SID>handle_hiragana()<CR>
+inoremap <buffer> <C-k> <Cmd>call <SID>handle_katakana()<CR>
diff --git a/process-database.hs b/process-database.hs
index 1fa9813..8ce60fd 100644..100755
--- a/process-database.hs
+++ b/process-database.hs
@@ -36,6 +36,10 @@ blockBy n l = case splitAt n l of
hasDuplicates :: Ord a => [a] -> Bool
hasDuplicates (sort -> l) = any (uncurry (==)) (zip l (drop 1 l))
+-- | Remove repeated elements, keeping the first occurrence of each and
+-- preserving the order of the remaining elements. Quadratic in the length of
+-- the list.
+dedup :: Eq a => [a] -> [a]
+dedup = foldr (\x rest -> x : filter (/= x) rest) []
+
data Row i = Row
{ rLexeme :: String
, rOrtho :: String
@@ -54,9 +58,9 @@ readCSV = map (toRow . splitOn ',') . lines
-- (hira->kata, kata->ID, num IDs)
readTable :: String -> (Map Char Char, Map Char Int, Int)
readTable input =
- let list = [(h, k) | [[h], [k]] <- map words (lines input)]
- in (Map.fromList list
- ,Map.fromList (zip (map snd list) [0..])
+ let list = [(h, k) | [h, k] <- map words (lines input)]
+ in (Map.fromList [(h, k) | ([h], [k]) <- list]
+ ,Map.fromList (zip [k | (_, [k]) <- list] [0..])
,length list)
normalise :: Map Char Int -> Row Char -> [Row Int]
@@ -88,7 +92,7 @@ makeTrie prefixlen rows =
longer = Map.fromListWith (++)
[(rReading row !! prefixlen, [row])
| row <- rows, length (rReading row) > prefixlen]
- in Node [(rLexeme r, rFreq r) | r <- here]
+ in Node [(target, rFreq r) | r <- here, target <- dedup [rLexeme r, rOrtho r]]
(Map.assocs (makeTrie (prefixlen + 1) <$> longer))
-- Serialised format of the trie: