summary refs log tree commit diff
path: root/ichimoe-get.js
diff options
context:
space:
mode:
Diffstat (limited to 'ichimoe-get.js')
-rwxr-xr-x ichimoe-get.js 185
1 files changed, 185 insertions, 0 deletions
diff --git a/ichimoe-get.js b/ichimoe-get.js
new file mode 100755
index 0000000..f929db5
--- /dev/null
+++ b/ichimoe-get.js
@@ -0,0 +1,185 @@
+#!/usr/bin/env node
+const https = require("https");
+const util = require("util");
+const cheerio = require("cheerio");
+
+function atob(b64) {
+ return Buffer.from(b64, "base64").toString();
+}
+
+function normaliseArg(arg) {
+ if ((arg[0] == '"' || arg[0] == "'") && arg[arg.length - 1] == arg[0]) {
+ return arg.slice(1, arg.length - 1);
+ } else {
+ return arg;
+ }
+}
+
+function parseDefinitionLI($, li) {
+ const items = $(li).children();
+ let pos = null, desc = null, unknown = [];
+
+ for (let i = 0; i < items.length; i++) {
+ if (items[i].type != "tag") continue;
+ const tagname = items[i].name;
+ const cls = items[i].attribs.class;
+ if (tagname == "span" && cls == "pos-desc") {
+ pos = $(items[i]).text().trim();
+ } else if (tagname == "span" && cls == "gloss-desc") {
+ desc = $(items[i]).text().trim();
+ } else if (tagname == "span" && cls == "sense-info-note has-tip tip-top") {
+ // note, let's skip those
+ } else {
+ unknown.push("li?? " + tagname + " " + cls);
+ }
+ }
+
+ if (unknown.length) return {pos, desc, unknown};
+ else return {pos, desc};
+}
+
+function parseDefinitionsOL($, ol) {
+ const definitions = [];
+ const lis = $(ol).children();
+ for (let j = 0; j < lis.length; j++) {
+ definitions.push(parseDefinitionLI($, lis[j]));
+ }
+ return definitions;
+}
+
+function parseDL($, dl) {
+ const alternatives = [];
+ const items = $(dl).children();
+ for (let i = 0; i < items.length/2; i++) {
+ const dt = items[2*i];
+ const dd = items[2*i+1];
+ // This replace is not always necessary but it seems reasonably safe
+ const text = $(dt).text().trim().replace(new RegExp(`^${i+1}\\. `), "");
+ const result = parseDefinitionsDD($, dd);
+ alternatives.push({text, ...result});
+ }
+ return alternatives;
+}
+
+function parseConjugationDiv($, div) {
+ const items = $(div).children();
+ let formdesc = null;
+ let alternatives = [];
+
+ for (let i = 0; i < items.length; i++) {
+ if (items[i].type != "tag") continue;
+ const tagname = items[i].name;
+ const cls = items[i].attribs.class;
+ if (tagname == "div" && cls == "conj-prop") {
+ if (formdesc == null) formdesc = $(items[i]).text().trim().replace(/ +/g, " ");
+ } else if (tagname == "div" && cls == "conj-gloss") {
+ const dl = $(items[i]).children()[0];
+ alternatives = alternatives.concat(parseDL($, dl));
+ } else {
+ unknown.push("conj?? " + tagname + " " + cls);
+ }
+ }
+
+ return {
+ form: formdesc,
+ alternatives
+ };
+}
+
+function parseConjugationsDiv($, div) {
+ const conjugations = [];
+ const divs = $(div).children();
+ for (let j = 0; j < divs.length; j++) {
+ conjugations.push(parseConjugationDiv($, divs[j]));
+ }
+ return conjugations;
+}
+
+function parseDefinitionsDD($, dd) {
+ const items = $(dd).children();
+ let definitions = [];
+ let conjugations = [];
+ let compounds = [];
+ let suffix = null;
+ const unknown = [];
+
+ for (let i = 0; i < items.length; i++) {
+ if (items[i].type != "tag") continue;
+ const tagname = items[i].name;
+ const cls = items[i].attribs.class;
+ if (tagname == "ol" && cls == "gloss-definitions") {
+ definitions = definitions.concat(parseDefinitionsOL($, items[i]));
+ } else if (tagname == "div" && cls == "conjugations") {
+ conjugations = conjugations.concat(parseConjugationsDiv($, items[i]));
+ } else if (tagname == "span" && cls == "compound-desc") {
+ compounds.push({
+ desc: $("span.compound-desc-word", items[i]).text().trim(),
+ parts: null,
+ });
+ } else if (tagname == "dl" && cls == "compounds") {
+ if (compounds.length > 0 && compounds[compounds.length - 1].parts == null) {
+ compounds[compounds.length - 1].parts = parseDL($, items[i]);
+ }
+ } else if (tagname == "span" && cls == "suffix-desc") {
+ suffix = $(items[i]).text().trim();
+ } else {
+ unknown.push("dd?? " + tagname + " " + cls);
+ }
+ }
+
+ let result = {};
+ if (definitions.length) result.definitions = definitions;
+ if (conjugations.length) result.conjugations = conjugations;
+ if (compounds.length) result.compounds = compounds;
+ if (suffix != null) result.suffix = suffix;
+ if (unknown.length) result.unknown = unknown;
+ return result;
+}
+
+function parseWordLI($, li) {
+ const romaji = $($(".gloss-rtext", li)[0]).text().trim();
+ const dl = $(".gloss-content > dl.alternatives", li);
+ const alternatives = parseDL($, dl);
+ return {romaji, alternatives};
+}
+
+function parseIchimoeHTML(html) {
+ const $ = cheerio.load(html);
+ const words = [];
+ const rows = $(".gloss-row");
+ for (let i = 0; i < rows.length; i++) {
+ if ($(rows[i]).hasClass("hidden")) continue;
+ const lis = $($("ul", rows[i])[0]).children();
+ for (let j = 0; j < lis.length; j++) {
+ words.push(parseWordLI($, lis[j]));
+ }
+ }
+ return words;
+}
+
+// const result = parseIchimoeHTML(require("fs").readFileSync("kioku.ichimoe.html"));
+// console.log(util.inspect(result, {depth: Infinity, colors: true}));
+// process.exit();
+
+if (process.argv.length != 3) {
+ console.error(`Usage: ./ichimoe-get.js <文>`);
+ process.exit(1);
+}
+
+const word = process.argv[2];
+const url = `https://ichi.moe/cl/qr/?q=${encodeURIComponent(word)}&r=htr`;
+
+https.get(url, res => {
+ if (res.statusCode != 200) {
+ console.error(`Ichi.moe returned status code: ${res.statusCode}`);
+ console.error(res.headers);
+ process.exit(1);
+ }
+
+ let body = "";
+ res.on("data", data => body += data);
+ res.on("end", () => {
+ const result = parseIchimoeHTML(body);
+ console.log(util.inspect(result, {depth: Infinity}));
+ });
+});