From 0e556a7eedb779adfc36db2aecd378eccaaa2420 Mon Sep 17 00:00:00 2001 From: Tom Smeding Date: Sat, 21 Mar 2020 18:55:33 +0100 Subject: Basic ichimoe parser --- ichimoe-get.js | 185 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100755 ichimoe-get.js (limited to 'ichimoe-get.js') diff --git a/ichimoe-get.js b/ichimoe-get.js new file mode 100755 index 0000000..f929db5 --- /dev/null +++ b/ichimoe-get.js @@ -0,0 +1,185 @@ +#!/usr/bin/env node +const https = require("https"); +const util = require("util"); +const cheerio = require("cheerio"); + +function atob(b64) { + return Buffer.from(b64, "base64").toString(); +} + +function normaliseArg(arg) { + if ((arg[0] == '"' || arg[0] == "'") && arg[arg.length - 1] == arg[0]) { + return arg.slice(1, arg.length - 1); + } else { + return arg; + } +} + +function parseDefinitionLI($, li) { + const items = $(li).children(); + let pos = null, desc = null, unknown = []; + + for (let i = 0; i < items.length; i++) { + if (items[i].type != "tag") continue; + const tagname = items[i].name; + const cls = items[i].attribs.class; + if (tagname == "span" && cls == "pos-desc") { + pos = $(items[i]).text().trim(); + } else if (tagname == "span" && cls == "gloss-desc") { + desc = $(items[i]).text().trim(); + } else if (tagname == "span" && cls == "sense-info-note has-tip tip-top") { + // note, let's skip those + } else { + unknown.push("li?? " + tagname + " " + cls); + } + } + + if (unknown.length) return {pos, desc, unknown}; + else return {pos, desc}; +} + +function parseDefinitionsOL($, ol) { + const definitions = []; + const lis = $(ol).children(); + for (let j = 0; j < lis.length; j++) { + definitions.push(parseDefinitionLI($, lis[j])); + } + return definitions; +} + +function parseDL($, dl) { + const alternatives = []; + const items = $(dl).children(); + for (let i = 0; i < items.length/2; i++) { + const dt = items[2*i]; + const dd = items[2*i+1]; + // This replace is not always necessary but it seems reasonably safe + const text = $(dt).text().trim().replace(new RegExp(`^${i+1}\\. `), ""); + const result = parseDefinitionsDD($, dd); + alternatives.push({text, ...result}); + } + return alternatives; +} + +function parseConjugationDiv($, div) { + const items = $(div).children(); + let formdesc = null; + let alternatives = []; + + for (let i = 0; i < items.length; i++) { + if (items[i].type != "tag") continue; + const tagname = items[i].name; + const cls = items[i].attribs.class; + if (tagname == "div" && cls == "conj-prop") { + if (formdesc == null) formdesc = $(items[i]).text().trim().replace(/ +/g, " "); + } else if (tagname == "div" && cls == "conj-gloss") { + const dl = $(items[i]).children()[0]; + alternatives = alternatives.concat(parseDL($, dl)); + } else { + unknown.push("conj?? " + tagname + " " + cls); + } + } + + return { + form: formdesc, + alternatives + }; +} + +function parseConjugationsDiv($, div) { + const conjugations = []; + const divs = $(div).children(); + for (let j = 0; j < divs.length; j++) { + conjugations.push(parseConjugationDiv($, divs[j])); + } + return conjugations; +} + +function parseDefinitionsDD($, dd) { + const items = $(dd).children(); + let definitions = []; + let conjugations = []; + let compounds = []; + let suffix = null; + const unknown = []; + + for (let i = 0; i < items.length; i++) { + if (items[i].type != "tag") continue; + const tagname = items[i].name; + const cls = items[i].attribs.class; + if (tagname == "ol" && cls == "gloss-definitions") { + definitions = definitions.concat(parseDefinitionsOL($, items[i])); + } else if (tagname == "div" && cls == "conjugations") { + conjugations = conjugations.concat(parseConjugationsDiv($, items[i])); + } else if (tagname == "span" && cls == "compound-desc") { + compounds.push({ + desc: $("span.compound-desc-word", items[i]).text().trim(), + parts: null, + }); + } else if (tagname == "dl" && cls == "compounds") { + if (compounds.length > 0 && compounds[compounds.length - 1].parts == null) { + compounds[compounds.length - 1].parts = parseDL($, items[i]); + } + } else if (tagname == "span" && cls == "suffix-desc") { + suffix = $(items[i]).text().trim(); + } else { + unknown.push("dd?? " + tagname + " " + cls); + } + } + + let result = {}; + if (definitions.length) result.definitions = definitions; + if (conjugations.length) result.conjugations = conjugations; + if (compounds.length) result.compounds = compounds; + if (suffix != null) result.suffix = suffix; + if (unknown.length) result.unknown = unknown; + return result; +} + +function parseWordLI($, li) { + const romaji = $($(".gloss-rtext", li)[0]).text().trim(); + const dl = $(".gloss-content > dl.alternatives", li); + const alternatives = parseDL($, dl); + return {romaji, alternatives}; +} + +function parseIchimoeHTML(html) { + const $ = cheerio.load(html); + const words = []; + const rows = $(".gloss-row"); + for (let i = 0; i < rows.length; i++) { + if ($(rows[i]).hasClass("hidden")) continue; + const lis = $($("ul", rows[i])[0]).children(); + for (let j = 0; j < lis.length; j++) { + words.push(parseWordLI($, lis[j])); + } + } + return words; +} + +// const result = parseIchimoeHTML(require("fs").readFileSync("kioku.ichimoe.html")); +// console.log(util.inspect(result, {depth: Infinity, colors: true})); +// process.exit(); + +if (process.argv.length != 3) { + console.error(`Usage: ./ichimoe-get.js <文>`); + process.exit(1); +} + +const word = process.argv[2]; +const url = `https://ichi.moe/cl/qr/?q=${encodeURIComponent(word)}&r=htr`; + +https.get(url, res => { + if (res.statusCode != 200) { + console.error(`Ichi.moe returned status code: ${res.statusCode}`); + console.error(res.headers); + process.exit(1); + } + + let body = ""; + res.on("data", data => body += data); + res.on("end", () => { + const result = parseIchimoeHTML(body); + console.log(util.inspect(result, {depth: Infinity})); + }); +}); -- cgit v1.2.3-54-g00ecf