From b8d7b530070d45482756a27be367bef5c1b735e0 Mon Sep 17 00:00:00 2001 From: Tom Smeding Date: Sun, 7 Jul 2024 16:53:56 +0200 Subject: Kanji completion basically working --- README.txt | 5 +- japanese.vim | 398 +++++++++++++++++++++++++++++++++++++--------------- process-database.hs | 12 +- 3 files changed, 296 insertions(+), 119 deletions(-) mode change 100644 => 100755 process-database.hs diff --git a/README.txt b/README.txt index c93cbe1..b54816a 100644 --- a/README.txt +++ b/README.txt @@ -12,5 +12,6 @@ Put the result in a CSV (say "database.csv") with 5 columns. It can be ascertained that the data from the spreadsheet does not contain commas in the selected columns, so the CSV conversion is safe. -Then - $ cabal run process-database.hs -- database.csv +Then: + $ ./process-database.hs database.csv database.bin +creates the indexed database file that is read by the vim script. diff --git a/japanese.vim b/japanese.vim index ccd90a5..b2ed95c 100644 --- a/japanese.vim +++ b/japanese.vim @@ -1,79 +1,104 @@ " Convention: If a variable holds a 1-based coordinate, its name is suffixed " with '_1'. +" TODO: +" - Remove the underline when completing, it's the wrong length anyway +" - Make starting completion do something sensible when the cursor is not at +" the end of the current region +" - Fix backspace while completing + " ----------------------------- GENERIC HELPERS ------------------------------- -if !hlexists("JapaneseVimFragment") +if !hlexists('JapaneseVimFragment') hi JapaneseVimFragment cterm=underline gui=underline endif function s:append_at_cursor(text) - const y_1 = line(".") - const xbytes = col(".") - 1 - const ln = getline(".") - call setline(".", strpart(ln, 0, xbytes) .. a:text .. strpart(ln, xbytes)) + const y_1 = line('.') + const xbytes = col('.') - 1 + const ln = getline('.') + call setline('.', strpart(ln, 0, xbytes) .. a:text .. strpart(ln, xbytes)) call cursor(y_1, xbytes + 1 + strlen(a:text)) endfunction function s:isvowel(c) - return strcharlen(a:c) == 1 && stridx("aiueo", a:c) != -1 + return strcharlen(a:c) == 1 && stridx('aiueo', a:c) != -1 endfunction function s:isconsonant(c) - return strcharlen(a:c) == 1 && stridx("kgsztdnhbpmyrw", a:c) != -1 + return strcharlen(a:c) == 1 && stridx('kgsztdnhbpmyrw', a:c) != -1 endfunction " ---------------------------------- DATA ------------------------------------- -const s:nokana_digraph = ["yi", "ye", "wu"] +const s:nokana_digraph = ['yi', 'ye', 'wu'] const s:punctuation = { - \ ".": "。", - \ ",": "、", - \ "(": "(", - \ ")": ")", - \ "{": "{", - \ "}": "}", - \ "[": "「", - \ "]": "」", - \ "/": "・", - \ "~": "〜", - \ ":": ":", - \ "!": "!", - \ "?": "?", + \ '.': '。', + \ ',': '、', + \ '(': '(', + \ ')': ')', + \ '{': '{', + \ '}': '}', + \ '[': '「', + \ ']': '」', + \ '/': '・', + \ '~': '〜', + \ ':': ':', + \ '!': '!', + \ '?': '?', \ } let s:hira2kata = {} let s:kata2hira = {} -let s:hira2id = {} -let s:id2hira = [] +let s:kata2id = {} +let s:id2kata = [] function s:filltables() let i = 0 - for ln in readfile("table.txt") - let words = split(ln, " ") + for ln in readfile('table.txt') + let words = split(ln, ' ') let s:hira2kata[words[0]] = words[1] let s:kata2hira[words[1]] = words[0] - let s:hira2id[words[0]] = i - call add(s:id2hira, words[0]) + let s:kata2id[words[1]] = i + call add(s:id2kata, words[1]) let i += 1 endfor endfunction call s:filltables() lockvar! s:hira2kata lockvar! s:kata2hira -lockvar! s:hira2id -lockvar! s:id2hira +lockvar! s:kata2id +lockvar! s:id2kata try - const s:readingdb = readblob("database.bin") + const s:readingdb = readblob('database.bin') catch - echoe "Reading database not found, make sure you ran process-database.hs" + echoe 'Reading database not found, make sure you ran process-database.hs' endtry +function s:iskana(c) + return has_key(s:hira2kata, a:c) || has_key(s:kata2hira, a:c) +endfunction + + +" ----------------------------- GLOBAL VARIABLES ------------------------------ + +" The fragment currently being typed and formatted. +" Dictionary as returned by s:make_region. +let s:cur_region = v:null + +" Set when starting completion, undefined at other times (may not be reset to +" v:null). +let s:completion_orig_text = v:null + " ------------------------ READING DATABASE FUNCTIONS ------------------------- +" This bound is not adhered to _precisely_, but collection stops soon after +" hitting this limit. +const s:rd_max_num_results = 1000 + function s:rd_read32le(offset) let n = s:readingdb[a:offset] let n += 0x100 * s:readingdb[a:offset+1] @@ -82,40 +107,135 @@ function s:rd_read32le(offset) return n endfunction -" Adds words to the a:accum list. The list is a list of pairs -" [String, Number]: a word with its associated frequency. -" Only words that share a maximal-length prefix with idlist will be added. +" Adds words to the a:accum list. a:accum is a list of hits; a hit is a list: +" [String <- the dictionary lexeme +" ,Number <- frequency recorded in the dictionary +" ,Number <- number of kana in the reading of the lexeme +" ,Number] <- length of shared prefix between reading and given idlist +" Only lexemes that share a maximal-length prefix with idlist are added. " If a:idlist = v:null, everything will be added. -function s:rd_find_idlist_rec(idlist, prefixlen, offset, accum) +function s:rd_find_idlist_rec(idlist, prefixlen, sharedprefixlen, offset, accum) + const should_accum = a:idlist is v:null || a:prefixlen >= len(a:idlist) + const nlexemes = s:readingdb[a:offset] - let off = offset + 1 + let off = a:offset + 1 for i in range(nlexemes) let wordbytes = s:readingdb[off] - if a:idlist is v:null + if should_accum " A blob is converted to a string by interpreting it as UTF-8 when " marshalling it to Lua. Hence we have a convoluted UTF-8 parser right " here. Why does vim not have this built-in in a more sensible way? - let word = luaeval("_A", s:readingdb[off + 1 : off + wordbytes]) + let word = luaeval('_A', s:readingdb[off + 1 : off + wordbytes]) let freq = s:rd_read32le(off + 1 + wordbytes) - call add(a:accum, [word, freq]) + call add(a:accum, [word, freq, a:prefixlen, a:sharedprefixlen]) endif let off += 1 + wordbytes + 4 endfor - " TODO: what exactly should we do here? Under what conditions should we - " recurse in what fashion? + " Prevent the number of results from getting out of hand + if len(a:accum) >= s:rd_max_num_results + return + endif + + const nedges = s:readingdb[off] + + let found_match = 0 + let off += 1 + for i in range(nedges) + let key = s:readingdb[off] + " if a:prefixlen == 1 + " echom "prefixlen=1 key=" .. string(key) .. " (nedges=" .. string(nedges) .. ")" + " endif + " If !should_accum then necessarily a:prefixlen < len(a:idlist) + if should_accum || key == a:idlist[a:prefixlen] + let found_match = 1 + " if !should_accum + " echom "found match for key=" .. string(key) .. " at prefixlen=" .. string(a:prefixlen) + " endif + call s:rd_find_idlist_rec( + \ a:idlist, a:prefixlen + 1, a:sharedprefixlen + !should_accum, + \ s:rd_read32le(off + 1), a:accum) + endif + let off += 5 + endfor + + if !found_match && !should_accum + " None of the continuations match, and yet we didn't decide yet that we + " should start accumulating, then this is the maximal shared prefix + " between the input idlist and the database. Hence we should start + " accumulating, which we do by simply re-visiting this node in + " accumulation mode. + call s:rd_find_idlist_rec(v:null, a:prefixlen, a:sharedprefixlen, a:offset, a:accum) + endif +endfunction - " if a:idlist is v:null || len(a:idlist) > a:prefixlen - " const nedges = s:readingdb[off] +function s:rd_kana_to_id(kana) + const kata = get(s:hira2kata, a:kana, a:kana) + " Becomes -1 when not found + return get(s:kata2id, kata) +endfunction - " let off += 1 - " for i in range(nedges) - " let key = s:readingdb[off] +" Returns list of hits; see s:rd_find_idlist_rec for what a hit is. +function s:rd_find_kana(kanastr) + let idlist = [] + for c in a:kanastr + call add(idlist, s:rd_kana_to_id(c)) + endfor + + let accum = [] + call s:rd_find_idlist_rec(idlist, 0, 0, 0, accum) + return accum +endfunction + +function s:rd_score_hit(hit) + return a:hit[1] * pow(0.85, a:hit[2] - a:hit[3]) +endfunction - " let off += 5 - " endfor - " endif +" Sorts in-place. +function s:rd_sort_hits(hits) + " right - left results in descending sort. + " We don't need exact rounding here; if the scores are <1 apart, they're + " basically equal anyway. So float2nr() is enough. + call sort(a:hits, {h1, h2 -> float2nr(s:rd_score_hit(h2) - s:rd_score_hit(h1))}) +endfunction + +" Returns list of |complete-items| that can replace the kanastr. +function s:rd_complete_kana(kanastr) + const kanacharlen = strcharlen(a:kanastr) + + let hits = s:rd_find_kana(a:kanastr) + call s:rd_sort_hits(hits) + " echom "hits:" hits + + let result = [] + for hit in hits + " If the shared prefix length is less than the number of input kana, we + " have to replace the differing part of the hit with the remaining kana. + " We can only do this if the differing part is all-kana. + if hit[3] < kanacharlen + let prefixlen = strcharlen(hit[0]) - (hit[2] - hit[3]) + let tail = strcharpart(hit[0], prefixlen) + let discard = 0 + for c in tail + if !s:iskana(c) + let discard = 1 + break + endif + endfor + if discard + continue + endif + let complword = strcharpart(hit[0], 0, prefixlen) .. strcharpart(a:kanastr, hit[3]) + else + " The entire input was matched, so we can complete the whole hit. + let complword = hit[0] + endif + + call add(result, #{word: complword, equal: 1}) + endfor + + return result endfunction @@ -123,7 +243,7 @@ endfunction function s:renderkana(s, kanamode) if len(a:s) == 0 - return "" + return '' endif if a:kanamode == 1 @@ -131,57 +251,69 @@ function s:renderkana(s, kanamode) elseif a:kanamode == 2 const hira = 0 else - throw "Invalid kanamode " . a:kanamode + throw 'Invalid kanamode ' . a:kanamode endif - " echom "renderkana:" a:s strcharpart(a:s, 0, 1) s:isvowel(strcharpart(a:s, 0, 1)) + " echom 'renderkana:' a:s strcharpart(a:s, 0, 1) s:isvowel(strcharpart(a:s, 0, 1)) - let result = "" + let result = '' let i = 0 while i < strcharlen(a:s) let c0 = strcharpart(a:s, i, 1) let c1 = strcharpart(a:s, i+1, 1) let c2 = strcharpart(a:s, i+2, 1) - " echom "render:" a:s i c0 + " echom 'render:' a:s i c0 " vowel kana if s:isvowel(c0) - let result ..= hira ? digraph_get(c0 .. "5") : digraph_get(toupper(c0) .. "6") + let result ..= hira ? digraph_get(c0 .. '5') : digraph_get(toupper(c0) .. '6') let i += 1 " consonant-vowel kana elseif s:isconsonant(c0) && s:isvowel(c1) && index(s:nokana_digraph, c0 .. c1) == -1 let result ..= hira ? digraph_get(c0 .. c1) : digraph_get(toupper(c0) .. c1) let i += 2 - " v-dakuten kana - elseif c0 ==# "v" && s:isvowel(c1) - let kata = digraph_get("V" .. c1) - let result ..= hira ? s:kata2hira[kata] : kata - let i += 2 " glide - elseif s:isconsonant(c0) && c1 ==# "y" && s:isvowel(c2) - let result ..= hira ? digraph_get(c0 .. "i") : digraph_get(toupper(c0) .. "i") - let result ..= hira ? digraph_get("y" .. toupper(c2)) : digraph_get("Y" .. toupper(c2)) + elseif s:isconsonant(c0) && c1 ==# 'y' && s:isvowel(c2) + let result ..= hira ? digraph_get(c0 .. 'i') : digraph_get(toupper(c0) .. 'i') + let result ..= hira ? digraph_get('y' .. toupper(c2)) : digraph_get('Y' .. toupper(c2)) + let i += 3 + " t-h-i shorthand + elseif (c0 .. c1 .. c2) == "thi" + let result ..= hira ? 'てぃ' : 'ティ' + let i += 3 + " d-h-i shorthand + elseif (c0 .. c1 .. c2) == "dhi" + let result ..= hira ? 'でぃ' : 'ディ' let i += 3 " normal n - elseif c0 ==# "n" && c1 ==# "n" - let result ..= (hira ? "ん" : "ン") + elseif c0 ==# 'n' && c1 ==# 'n' + let result ..= (hira ? 'ん' : 'ン') let i += 2 " implicit n - elseif c0 ==# "n" && c1 !=# "y" && s:isconsonant(c1) - let result ..= (hira ? "ん" : "ン") + elseif c0 ==# 'n' && c1 !=# 'y' && s:isconsonant(c1) + let result ..= (hira ? 'ん' : 'ン') let i += 1 " geminated consonant elseif s:isconsonant(c0) && c0 ==# c1 - let result ..= (hira ? "っ" : "ッ") + let result ..= (hira ? 'っ' : 'ッ') let i += 1 + " v-kana shorthand + elseif c0 ==# 'v' && s:isvowel(c1) + let result ..= hira ? 'ゔ' .. digraph_get(toupper(c1) .. '5') + \ : 'ヴ' .. digraph_get(c1 .. '6') + let i += 2 " small vowel - elseif (c0 ==# "x" || c0 ==# "l") && s:isvowel(c1) - let result ..= hira ? digraph_get(toupper(c1) .. "5") : digraph_get(c1 .. "6") + elseif (c0 ==# 'x' || c0 ==# 'l') && s:isvowel(c1) + let result ..= hira ? digraph_get(toupper(c1) .. '5') : digraph_get(c1 .. '6') let i += 2 + " small y-kana + elseif (c0 ==# 'x' || c0 ==# 'l') && c1 ==# 'y' && index(['a','u','o'], c2) != -1 + let result ..= hira ? digraph_get('y' .. toupper(c2)) : digraph_get('Y' .. toupper(c2)) + let i += 3 " sokuon - elseif c0 == "-" - let result ..= "ー" + elseif c0 == '-' + let result ..= 'ー' let i += 1 " puctuation elseif has_key(s:punctuation, c0) @@ -201,7 +333,7 @@ function s:make_region_match(region) const ln = getline(a:region.y_1) const prefix = strcharpart(ln, 0, a:region.x1_1 - 1) const infix = strcharpart(ln, a:region.x1_1 - 1, a:region.x2_1 - a:region.x1_1) - return matchaddpos("JapaneseVimFragment", [[a:region.y_1, strlen(prefix) + 1, strlen(infix)]]) + return matchaddpos('JapaneseVimFragment', [[a:region.y_1, strlen(prefix) + 1, strlen(infix)]]) endfunction " y_1: 1-based line number of the region @@ -221,10 +353,6 @@ function s:make_region(y_1, x1_1, x2_1, kanamode) return d endfunction -" The fragment currently being typed and formatted. -" Dictionary as returned by s:make_region. -let s:cur_region = v:null - function s:terminate_region() if s:cur_region isnot v:null call matchdelete(s:cur_region.matchid) @@ -240,14 +368,20 @@ function s:refresh_region_matchid() const ln = getline(s:cur_region.y_1) const prefix = strcharpart(ln, 0, s:cur_region.x1_1 - 1) const infix = strcharpart(ln, s:cur_region.x1_1 - 1, s:cur_region.x2_1 - s:cur_region.x1_1) - let s:cur_region.matchid = matchaddpos("JapaneseVimFragment", [[s:cur_region.y_1, strlen(prefix) + 1, strlen(infix)]]) + let s:cur_region.matchid = matchaddpos('JapaneseVimFragment', [[s:cur_region.y_1, strlen(prefix) + 1, strlen(infix)]]) endfunction -function s:handle_keypress(key) - " call s:insert_at_cursor("key " .. a:key) - const y_1 = line(".") - const x = charcol(".") - 1 - const fullline = getline(".") +function s:handle_keypress_expr(key) + const escapedkey = a:key == "'" ? "''" : a:key + return (pumvisible() ? "\\call " . expand('') . "terminate_region()\" : '') .. + \ "\call " .. expand('') .. "handle_keypress_cmd('" .. escapedkey .. "')\" +endfunction + +function s:handle_keypress_cmd(key) + " call s:insert_at_cursor('key ' .. a:key) + const y_1 = line('.') + const x = charcol('.') - 1 + const fullline = getline('.') " If the user moved away from the region, start a new region here if s:cur_region isnot v:null && (y_1 != s:cur_region.y_1 || x + 1 < s:cur_region.x1_1 || x + 1 > s:cur_region.x2_1) @@ -274,7 +408,7 @@ function s:handle_keypress(key) " echom y_1 x s:cur_region input result - call setline(".", strcharpart(fullline, 0, x - input_extra_chars) .. result .. strcharpart(fullline, x)) + call setline('.', strcharpart(fullline, 0, x - input_extra_chars) .. result .. strcharpart(fullline, x)) const newx_1 = x + 1 - input_extra_chars + strcharlen(result) call setcursorcharpos(y_1, newx_1) @@ -288,14 +422,14 @@ endfunction function s:convert_region_kana(tokanamode) if s:cur_region is v:null - const x_1 = charcol(".") - let s:cur_region = s:make_region(line("."), x_1, x_1, a:tokanamode) + const x_1 = charcol('.') + let s:cur_region = s:make_region(line('.'), x_1, x_1, a:tokanamode) return endif const ln = getline(s:cur_region.y_1) const s = strcharpart(ln, s:cur_region.x1_1 - 1, s:cur_region.x2_1 - s:cur_region.x1_1) - let result = "" + let result = '' for c in s " hiragana -> katakana if a:tokanamode == 2 @@ -318,43 +452,81 @@ function s:convert_region_kana(tokanamode) let s:cur_region.kanamode = a:tokanamode endfunction -function s:handle_escape() - call s:terminate_region() - stopinsert +function s:handle_escape_expr() + " : accept completion and close completion menu (|complete_CTRL-Y|) + return (pumvisible() ? "\" : '') .. + \ "\call " .. expand('') .. "terminate_region()\\" endfunction -function s:handle_space() - call s:terminate_region() - call s:append_at_cursor(" ") +function s:handle_space_expr() + return (pumvisible() ? "\" : '') .. + \ "\call " .. expand('') .. "terminate_region()\ " endfunction -function s:handle_enter() - if s:cur_region isnot v:null - call s:terminate_region() - else - " Simulate normal enter - const y_1 = line(".") - const xbytes = col(".") - 1 - const ln = getline(".") - call setline(".", strpart(ln, 0, xbytes)) - call append(y_1, strpart(ln, xbytes)) - call cursor(y_1 + 1, 0) - endif +function s:handle_enter_expr() + return (pumvisible() ? "\" : '') .. + \ (s:cur_region isnot v:null ? "\call " .. expand('') .. "terminate_region()" : '') .. + \ "\" +endfunction + +function s:handle_backspace_expr() + " : *discard* completion and close completion menu (|complete_CTRL-E|) + return (pumvisible() ? "\" : '') .. "\" endfunction function s:handle_hiragana() call s:convert_region_kana(1) endfunction -function s:handle_katagana() +function s:handle_katakana() call s:convert_region_kana(2) endfunction -for c in "abcdefghijklmnopqrstuvwxyz-.,(){}[]/~:!?" - execute "inoremap " .. c .. " call handle_keypress(\"" .. c .. "\")" +function s:handle_tab_expr() + if pumvisible() + return "\" " next completion item + endif + + if s:cur_region is v:null + return expand('') .. "orig\" " original tab binding if we're not typing + endif + + " Add a at the end to immediately select the first completion item + return "\call " .. expand('') .. "handle_tab_completion()\\" +endfunction + +function s:handle_shifttab_expr() + if pumvisible() + return "\" " previous completion item + endif + + " return expand('') .. "orig\" " original binding + return '' +endfunction + +function s:handle_tab_completion() + const ln = getline(s:cur_region.y_1) + const input = strcharpart(ln, s:cur_region.x1_1 - 1, s:cur_region.x2_1 - s:cur_region.x1_1) + const startcol = strlen(strcharpart(ln, 0, s:cur_region.x1_1 - 1)) + + let s:completion_orig_text = input + + " echom "complete_kana on" input + const list = s:rd_complete_kana(input) + " echom 'completing from ' .. string(startcol) .. ' with ' .. string(list) + call complete(startcol + 1, list) +endfunction + +for c in 'abcdefghijklmnopqrstuvwxyz-.,(){}[]/~:!?' + execute 'inoremap ' .. c .. ' handle_keypress_expr(''' .. c .. ''')' endfor -inoremap call handle_escape() -inoremap call handle_space() -inoremap call handle_enter() -inoremap call handle_hiragana() -inoremap call handle_katagana() +inoremap handle_escape_expr() +inoremap handle_space_expr() +inoremap handle_enter_expr() +inoremap handle_backspace_expr() +execute 'inoremap orig ' .. maparg('', 'i') +inoremap handle_tab_expr() +execute 'inoremap orig ' .. maparg('', 'i') +inoremap handle_shifttab_expr() +inoremap call handle_hiragana() +inoremap call handle_katakana() diff --git a/process-database.hs b/process-database.hs old mode 100644 new mode 100755 index 1fa9813..8ce60fd --- a/process-database.hs +++ b/process-database.hs @@ -36,6 +36,10 @@ blockBy n l = case splitAt n l of hasDuplicates :: Ord a => [a] -> Bool hasDuplicates (sort -> l) = any (uncurry (==)) (zip l (drop 1 l)) +dedup :: Eq a => [a] -> [a] +dedup [] = [] +dedup (x:xs) = x : dedup (filter (/= x) xs) + data Row i = Row { rLexeme :: String , rOrtho :: String @@ -54,9 +58,9 @@ readCSV = map (toRow . splitOn ',') . lines -- (hira->kata, kata->ID, num IDs) readTable :: String -> (Map Char Char, Map Char Int, Int) readTable input = - let list = [(h, k) | [[h], [k]] <- map words (lines input)] - in (Map.fromList list - ,Map.fromList (zip (map snd list) [0..]) + let list = [(h, k) | [h, k] <- map words (lines input)] + in (Map.fromList [(h, k) | ([h], [k]) <- list] + ,Map.fromList (zip [k | (_, [k]) <- list] [0..]) ,length list) normalise :: Map Char Int -> Row Char -> [Row Int] @@ -88,7 +92,7 @@ makeTrie prefixlen rows = longer = Map.fromListWith (++) [(rReading row !! prefixlen, [row]) | row <- rows, length (rReading row) > prefixlen] - in Node [(rLexeme r, rFreq r) | r <- here] + in Node [(target, rFreq r) | r <- here, target <- dedup [rLexeme r, rOrtho r]] (Map.assocs (makeTrie (prefixlen + 1) <$> longer)) -- Serialised format of the trie: -- cgit v1.2.3-70-g09d2