#!/usr/bin/env node
const https = require("https");
const fs = require("fs");
const crypto = require("crypto");
const util = require("util");
const cheerio = require("cheerio");

/**
 * Decode a base64 string into text.
 * Not called anywhere else in this file.
 *
 * @param {string} b64 - base64-encoded input
 * @returns {string} the decoded bytes converted with Buffer's default toString()
 */
function atob(b64) {
    return Buffer.from(b64, "base64").toString();
}

/**
 * Strip one pair of matching surrounding quotes (single or double) from an argument.
 * Not called anywhere else in this file.
 *
 * @param {string} arg - raw argument text
 * @returns {string} `arg` without its first and last character when both are the
 *   same quote character, otherwise `arg` unchanged
 */
function normaliseArg(arg) {
    const first = arg[0];
    const isQuote = first === '"' || first === "'";
    if (isQuote && arg[arg.length - 1] === first) {
        return arg.slice(1, -1);
    }
    return arg;
}

/**
 * Parse one <li> of a definitions list.
 *
 * @param {Function} $ - cheerio instance bound to the document
 * @param {object} li - the <li> element
 * @returns {{pos: ?string, desc: ?string, unknown?: string[]}} trimmed text of the
 *   `span.pos-desc` and `span.gloss-desc` children (null when absent); `unknown`
 *   is present only when unrecognised children were encountered
 */
function parseDefinitionLI($, li) {
    let pos = null;
    let desc = null;
    const unknown = [];

    for (const item of $(li).children().toArray()) {
        if (item.type !== "tag") continue;

        const tagname = item.name;
        const cls = item.attribs.class;
        if (tagname === "span" && cls === "pos-desc") {
            pos = $(item).text().trim();
        } else if (tagname === "span" && cls === "gloss-desc") {
            desc = $(item).text().trim();
        } else if (tagname === "span" && cls === "sense-info-note has-tip tip-top") {
            // note, let's skip those
        } else {
            unknown.push(`li?? ${tagname} ${cls}`);
        }
    }

    return unknown.length ? {pos, desc, unknown} : {pos, desc};
}

/**
 * Parse every child of a definitions <ol> with parseDefinitionLI.
 *
 * @param {Function} $ - cheerio instance bound to the document
 * @param {object} ol - the <ol> element
 * @returns {object[]} one parsed definition per child, in document order
 */
function parseDefinitionsOL($, ol) {
    return $(ol)
        .children()
        .toArray()
        .map(li => parseDefinitionLI($, li));
}

/**
 * Parse a <dl> whose children are consumed as consecutive (dt, dd) pairs.
 *
 * @param {Function} $ - cheerio instance bound to the document
 * @param {object} dl - the <dl> element (or a cheerio selection of it)
 * @returns {object[]} one entry per pair: the dt text plus the fields returned by
 *   parseDefinitionsDD for the dd
 */
function parseDL($, dl) {
    const alternatives = [];
    const items = $(dl).children();

    for (let i = 0; i < items.length / 2; i++) {
        const dt = items[2 * i];
        const dd = items[2 * i + 1];
        // Drops a leading "<n>. " numbering prefix from the dt text.
        // This replace is not always necessary but it seems reasonably safe
        const text = $(dt).text().trim().replace(new RegExp(`^${i + 1}\\. `), "");
        const result = parseDefinitionsDD($, dd);
        alternatives.push({text, ...result});
    }

    return alternatives;
}

/**
 * Parse a single conjugation <div>.
 *
 * @param {Function} $ - cheerio instance bound to the document
 * @param {object} div - the conjugation element
 * @returns {{form: ?string, alternatives: object[], unknown?: string[]}} `form` is
 *   the text of the first `div.conj-prop` child with runs of spaces collapsed;
 *   `alternatives` collects the parsed first child of every `div.conj-gloss`
 */
function parseConjugationDiv($, div) {
    let formdesc = null;
    const alternatives = [];
    const unknown = [];

    for (const item of $(div).children().toArray()) {
        if (item.type !== "tag") continue;

        const tagname = item.name;
        const cls = item.attribs.class;
        if (tagname === "div" && cls === "conj-prop") {
            // Only the first conj-prop is kept; later ones are ignored.
            if (formdesc === null) {
                formdesc = $(item).text().trim().replace(/ +/g, " ");
            }
        } else if (tagname === "div" && cls === "conj-gloss") {
            const dl = $(item).children()[0];
            alternatives.push(...parseDL($, dl));
        } else {
            unknown.push(`conj?? ${tagname} ${cls}`);
        }
    }

    const result = {form: formdesc, alternatives};
    if (unknown.length) result.unknown = unknown;
    return result;
}

/**
 * Parse every child of a conjugations container with parseConjugationDiv.
 *
 * @param {Function} $ - cheerio instance bound to the document
 * @param {object} div - the container element
 * @returns {object[]} one parsed conjugation per child, in document order
 */
function parseConjugationsDiv($, div) {
    return $(div)
        .children()
        .toArray()
        .map(child => parseConjugationDiv($, child));
}

/**
 * Parse the <dd> part of a (dt, dd) pair.
 *
 * @param {Function} $ - cheerio instance bound to the document
 * @param {object} dd - the <dd> element
 * @returns {object} contains only the non-empty fields among `definitions`,
 *   `conjugations`, `compounds`, `suffix` and `unknown`
 */
function parseDefinitionsDD($, dd) {
    const definitions = [];
    const conjugations = [];
    const compounds = [];
    const unknown = [];
    let suffix = null;

    for (const item of $(dd).children().toArray()) {
        if (item.type !== "tag") continue;

        const tagname = item.name;
        const cls = item.attribs.class;
        if (tagname === "ol" && cls === "gloss-definitions") {
            definitions.push(...parseDefinitionsOL($, item));
        } else if (tagname === "div" && cls === "conjugations") {
            conjugations.push(...parseConjugationsDiv($, item));
        } else if (tagname === "span" && cls === "compound-desc") {
            compounds.push({
                desc: $("span.compound-desc-word", item).text().trim(),
                parts: null,
            });
        } else if (tagname === "dl" && cls === "compounds") {
            // Attach the parts to the most recent compound that does not have any yet.
            const last = compounds[compounds.length - 1];
            if (compounds.length > 0 && last.parts === null) {
                last.parts = parseDL($, item);
            }
        } else if (tagname === "span" && cls === "suffix-desc") {
            suffix = $(item).text().trim();
        } else {
            unknown.push(`dd?? ${tagname} ${cls}`);
        }
    }

    const result = {};
    if (definitions.length) result.definitions = definitions;
    if (conjugations.length) result.conjugations = conjugations;
    if (compounds.length) result.compounds = compounds;
    if (suffix !== null) result.suffix = suffix;
    if (unknown.length) result.unknown = unknown;
    return result;
}

/**
 * Parse one word entry.
 *
 * @param {Function} $ - cheerio instance bound to the document
 * @param {object} li - the word's <li> element
 * @returns {{romaji: string, alternatives: object[]}} `romaji` is the text of the
 *   first `.info-link` inside the first `.gloss-rtext`, or the whole
 *   `.gloss-rtext` text when there is no such link
 */
function parseWordLI($, li) {
    const div = $(".gloss-rtext", li)[0];
    const links = $(".info-link", div);
    const romajiSource = links.length > 0 ? links[0] : div;
    const romaji = $(romajiSource).text().trim();

    const dl = $(".gloss-content > dl.alternatives", li);
    const alternatives = parseDL($, dl);
    return {romaji, alternatives};
}

/**
 * Parse a result page into a flat list of words.
 *
 * @param {string|Buffer} html - page markup
 * @returns {object[]} words from the first <ul> of every `.gloss-row` that does
 *   not carry the `hidden` class
 */
function parseIchimoeHTML(html) {
    const $ = cheerio.load(html);
    const words = [];

    for (const row of $(".gloss-row").toArray()) {
        if ($(row).hasClass("hidden")) continue;

        const firstUL = $("ul", row)[0];
        for (const li of $(firstUL).children().toArray()) {
            words.push(parseWordLI($, li));
        }
    }

    return words;
}

/**
 * Fetch `url` over HTTPS, caching the body on disk.
 *
 * The cache lives in `.ichimoe-cache` next to this script; the file name is the
 * SHA-256 hex digest of the URL. On a cache hit no request is made. On a non-200
 * response or a request error the process prints a message and exits with
 * status 1. File-system errors other than the expected ones are thrown.
 *
 * @param {string} url - address to fetch
 * @param {function(string): void} cb - called with the body text
 */
function cachedHTTPSGet(url, cb) {
    const id = crypto.createHash("sha256").update(url).digest("hex");
    const dir = `${__dirname}/.ichimoe-cache`;
    const path = `${dir}/${id}.html`;

    function doGet() {
        const req = https.get(url, res => {
            if (res.statusCode !== 200) {
                console.error(`Ichi.moe returned status code: ${res.statusCode}`);
                console.error(res.headers);
                process.exit(1);
            }

            // Decode at the stream level: concatenating raw Buffer chunks would
            // corrupt any multi-byte UTF-8 character split across two chunks.
            res.setEncoding("utf8");

            let body = "";
            res.on("data", chunk => {
                body += chunk;
            });
            res.on("end", () => {
                fs.writeFile(path, body, err => {
                    if (err) throw err;
                    cb(body);
                });
            });
        });

        // Without a listener a network failure becomes an unhandled 'error' event.
        req.on("error", err => {
            console.error(`Request to ${url} failed: ${err.message}`);
            process.exit(1);
        });
    }

    fs.mkdir(dir, mkdirErr => {
        if (mkdirErr && mkdirErr.code !== "EEXIST") throw mkdirErr;

        fs.readFile(path, "utf8", (readErr, data) => {
            if (!readErr) {
                cb(data);
            } else if (readErr.code === "ENOENT") {
                // Not cached yet: download and store it.
                doGet();
            } else {
                throw readErr;
            }
        });
    });
}

// Debugging aid: parse a saved page instead of fetching one.
// const result = parseIchimoeHTML(require("fs").readFileSync("kioku.ichimoe.html"));
// console.log(util.inspect(result, {depth: Infinity, colors: true}));
// process.exit();

if (process.argv.length !== 3) {
    console.error(`Usage: ./ichimoe-get.js <文>`);
    process.exit(1);
}

const word = process.argv[2];
const url = `https://ichi.moe/cl/qr/?q=${encodeURIComponent(word)}&r=htr`;

cachedHTTPSGet(url, body => {
    const result = parseIchimoeHTML(body);
    // Human-readable dump on a terminal, JSON when piped.
    if (process.stdout.isTTY) {
        console.log(util.inspect(result, {depth: Infinity}));
    } else {
        console.log(JSON.stringify(result));
    }
});