summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Tom Smeding <tom.smeding@gmail.com> 2020-03-21 18:55:33 +0100
committer Tom Smeding <tom.smeding@gmail.com> 2020-03-21 18:59:18 +0100
commit 0e556a7eedb779adfc36db2aecd378eccaaa2420 (patch)
tree 721b1d10fe782541163c715bd9f51206d7c298aa
Basic ichimoe parser
-rw-r--r-- .gitignore 1
-rwxr-xr-x ichimoe-get.js 185
-rw-r--r-- package-lock.json 150
-rw-r--r-- package.json 14
4 files changed, 350 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c2658d7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+node_modules/
diff --git a/ichimoe-get.js b/ichimoe-get.js
new file mode 100755
index 0000000..f929db5
--- /dev/null
+++ b/ichimoe-get.js
@@ -0,0 +1,185 @@
+#!/usr/bin/env node
+const https = require("https");
+const util = require("util");
+const cheerio = require("cheerio");
+
+function atob(b64) {
+ return Buffer.from(b64, "base64").toString();
+}
+
+function normaliseArg(arg) {
+ if ((arg[0] == '"' || arg[0] == "'") && arg[arg.length - 1] == arg[0]) {
+ return arg.slice(1, arg.length - 1);
+ } else {
+ return arg;
+ }
+}
+
+function parseDefinitionLI($, li) {
+ const items = $(li).children();
+ let pos = null, desc = null, unknown = [];
+
+ for (let i = 0; i < items.length; i++) {
+ if (items[i].type != "tag") continue;
+ const tagname = items[i].name;
+ const cls = items[i].attribs.class;
+ if (tagname == "span" && cls == "pos-desc") {
+ pos = $(items[i]).text().trim();
+ } else if (tagname == "span" && cls == "gloss-desc") {
+ desc = $(items[i]).text().trim();
+ } else if (tagname == "span" && cls == "sense-info-note has-tip tip-top") {
+ // note, let's skip those
+ } else {
+ unknown.push("li?? " + tagname + " " + cls);
+ }
+ }
+
+ if (unknown.length) return {pos, desc, unknown};
+ else return {pos, desc};
+}
+
+function parseDefinitionsOL($, ol) {
+ const definitions = [];
+ const lis = $(ol).children();
+ for (let j = 0; j < lis.length; j++) {
+ definitions.push(parseDefinitionLI($, lis[j]));
+ }
+ return definitions;
+}
+
+function parseDL($, dl) {
+ const alternatives = [];
+ const items = $(dl).children();
+ for (let i = 0; i < items.length/2; i++) {
+ const dt = items[2*i];
+ const dd = items[2*i+1];
+ // This replace is not always necessary but it seems reasonably safe
+ const text = $(dt).text().trim().replace(new RegExp(`^${i+1}\\. `), "");
+ const result = parseDefinitionsDD($, dd);
+ alternatives.push({text, ...result});
+ }
+ return alternatives;
+}
+
+function parseConjugationDiv($, div) {
+ const items = $(div).children();
+ let formdesc = null;
+ let alternatives = [];
+
+ for (let i = 0; i < items.length; i++) {
+ if (items[i].type != "tag") continue;
+ const tagname = items[i].name;
+ const cls = items[i].attribs.class;
+ if (tagname == "div" && cls == "conj-prop") {
+ if (formdesc == null) formdesc = $(items[i]).text().trim().replace(/ +/g, " ");
+ } else if (tagname == "div" && cls == "conj-gloss") {
+ const dl = $(items[i]).children()[0];
+ alternatives = alternatives.concat(parseDL($, dl));
+ } else {
+ unknown.push("conj?? " + tagname + " " + cls);
+ }
+ }
+
+ return {
+ form: formdesc,
+ alternatives
+ };
+}
+
+function parseConjugationsDiv($, div) {
+ const conjugations = [];
+ const divs = $(div).children();
+ for (let j = 0; j < divs.length; j++) {
+ conjugations.push(parseConjugationDiv($, divs[j]));
+ }
+ return conjugations;
+}
+
+function parseDefinitionsDD($, dd) {
+ const items = $(dd).children();
+ let definitions = [];
+ let conjugations = [];
+ let compounds = [];
+ let suffix = null;
+ const unknown = [];
+
+ for (let i = 0; i < items.length; i++) {
+ if (items[i].type != "tag") continue;
+ const tagname = items[i].name;
+ const cls = items[i].attribs.class;
+ if (tagname == "ol" && cls == "gloss-definitions") {
+ definitions = definitions.concat(parseDefinitionsOL($, items[i]));
+ } else if (tagname == "div" && cls == "conjugations") {
+ conjugations = conjugations.concat(parseConjugationsDiv($, items[i]));
+ } else if (tagname == "span" && cls == "compound-desc") {
+ compounds.push({
+ desc: $("span.compound-desc-word", items[i]).text().trim(),
+ parts: null,
+ });
+ } else if (tagname == "dl" && cls == "compounds") {
+ if (compounds.length > 0 && compounds[compounds.length - 1].parts == null) {
+ compounds[compounds.length - 1].parts = parseDL($, items[i]);
+ }
+ } else if (tagname == "span" && cls == "suffix-desc") {
+ suffix = $(items[i]).text().trim();
+ } else {
+ unknown.push("dd?? " + tagname + " " + cls);
+ }
+ }
+
+ let result = {};
+ if (definitions.length) result.definitions = definitions;
+ if (conjugations.length) result.conjugations = conjugations;
+ if (compounds.length) result.compounds = compounds;
+ if (suffix != null) result.suffix = suffix;
+ if (unknown.length) result.unknown = unknown;
+ return result;
+}
+
+function parseWordLI($, li) {
+ const romaji = $($(".gloss-rtext", li)[0]).text().trim();
+ const dl = $(".gloss-content > dl.alternatives", li);
+ const alternatives = parseDL($, dl);
+ return {romaji, alternatives};
+}
+
+function parseIchimoeHTML(html) {
+ const $ = cheerio.load(html);
+ const words = [];
+ const rows = $(".gloss-row");
+ for (let i = 0; i < rows.length; i++) {
+ if ($(rows[i]).hasClass("hidden")) continue;
+ const lis = $($("ul", rows[i])[0]).children();
+ for (let j = 0; j < lis.length; j++) {
+ words.push(parseWordLI($, lis[j]));
+ }
+ }
+ return words;
+}
+
+// const result = parseIchimoeHTML(require("fs").readFileSync("kioku.ichimoe.html"));
+// console.log(util.inspect(result, {depth: Infinity, colors: true}));
+// process.exit();
+
+if (process.argv.length != 3) {
+ console.error(`Usage: ./ichimoe-get.js <文>`);
+ process.exit(1);
+}
+
+const word = process.argv[2];
+const url = `https://ichi.moe/cl/qr/?q=${encodeURIComponent(word)}&r=htr`;
+
+https.get(url, res => {
+ if (res.statusCode != 200) {
+ console.error(`Ichi.moe returned status code: ${res.statusCode}`);
+ console.error(res.headers);
+ process.exit(1);
+ }
+
+ let body = "";
+ res.on("data", data => body += data);
+ res.on("end", () => {
+ const result = parseIchimoeHTML(body);
+ console.log(util.inspect(result, {depth: Infinity}));
+ });
+});
diff --git a/package-lock.json b/package-lock.json
new file mode 100644
index 0000000..a0af348
--- /dev/null
+++ b/package-lock.json
@@ -0,0 +1,150 @@
+{
+ "name": "japanese-lyrics",
+ "version": "1.0.0",
+ "lockfileVersion": 1,
+ "requires": true,
+ "dependencies": {
+ "@types/node": {
+ "version": "13.7.7",
+ "resolved": "https://registry.npmjs.org/@types/node/-/node-13.7.7.tgz",
+ "integrity": "sha512-Uo4chgKbnPNlxQwoFmYIwctkQVkMMmsAoGGU4JKwLuvBefF0pCq4FybNSnfkfRCpC7ZW7kttcC/TrRtAJsvGtg=="
+ },
+ "boolbase": {
+ "version": "1.0.0",
+ "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
+ "integrity": "sha1-aN/1++YMUes3cl6p4+0xDcwed24="
+ },
+ "cheerio": {
+ "version": "1.0.0-rc.3",
+ "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.3.tgz",
+ "integrity": "sha512-0td5ijfUPuubwLUu0OBoe98gZj8C/AA+RW3v67GPlGOrvxWjZmBXiBCRU+I8VEiNyJzjth40POfHiz2RB3gImA==",
+ "requires": {
+ "css-select": "~1.2.0",
+ "dom-serializer": "~0.1.1",
+ "entities": "~1.1.1",
+ "htmlparser2": "^3.9.1",
+ "lodash": "^4.15.0",
+ "parse5": "^3.0.1"
+ }
+ },
+ "css-select": {
+ "version": "1.2.0",
+ "resolved": "https://registry.npmjs.org/css-select/-/css-select-1.2.0.tgz",
+ "integrity": "sha1-KzoRBTnFNV8c2NMUYj6HCxIeyFg=",
+ "requires": {
+ "boolbase": "~1.0.0",
+ "css-what": "2.1",
+ "domutils": "1.5.1",
+ "nth-check": "~1.0.1"
+ }
+ },
+ "css-what": {
+ "version": "2.1.3",
+ "resolved": "https://registry.npmjs.org/css-what/-/css-what-2.1.3.tgz",
+ "integrity": "sha512-a+EPoD+uZiNfh+5fxw2nO9QwFa6nJe2Or35fGY6Ipw1R3R4AGz1d1TEZrCegvw2YTmZ0jXirGYlzxxpYSHwpEg=="
+ },
+ "dom-serializer": {
+ "version": "0.1.1",
+ "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-0.1.1.tgz",
+ "integrity": "sha512-l0IU0pPzLWSHBcieZbpOKgkIn3ts3vAh7ZuFyXNwJxJXk/c4Gwj9xaTJwIDVQCXawWD0qb3IzMGH5rglQaO0XA==",
+ "requires": {
+ "domelementtype": "^1.3.0",
+ "entities": "^1.1.1"
+ }
+ },
+ "domelementtype": {
+ "version": "1.3.1",
+ "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-1.3.1.tgz",
+ "integrity": "sha512-BSKB+TSpMpFI/HOxCNr1O8aMOTZ8hT3pM3GQ0w/mWRmkhEDSFJkkyzz4XQsBV44BChwGkrDfMyjVD0eA2aFV3w=="
+ },
+ "domhandler": {
+ "version": "2.4.2",
+ "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-2.4.2.tgz",
+ "integrity": "sha512-JiK04h0Ht5u/80fdLMCEmV4zkNh2BcoMFBmZ/91WtYZ8qVXSKjiw7fXMgFPnHcSZgOo3XdinHvmnDUeMf5R4wA==",
+ "requires": {
+ "domelementtype": "1"
+ }
+ },
+ "domutils": {
+ "version": "1.5.1",
+ "resolved": "https://registry.npmjs.org/domutils/-/domutils-1.5.1.tgz",
+ "integrity": "sha1-3NhIiib1Y9YQeeSMn3t+Mjc2gs8=",
+ "requires": {
+ "dom-serializer": "0",
+ "domelementtype": "1"
+ }
+ },
+ "entities": {
+ "version": "1.1.2",
+ "resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz",
+ "integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w=="
+ },
+ "htmlparser2": {
+ "version": "3.10.1",
+ "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.10.1.tgz",
+ "integrity": "sha512-IgieNijUMbkDovyoKObU1DUhm1iwNYE/fuifEoEHfd1oZKZDaONBSkal7Y01shxsM49R4XaMdGez3WnF9UfiCQ==",
+ "requires": {
+ "domelementtype": "^1.3.1",
+ "domhandler": "^2.3.0",
+ "domutils": "^1.5.1",
+ "entities": "^1.1.1",
+ "inherits": "^2.0.1",
+ "readable-stream": "^3.1.1"
+ }
+ },
+ "inherits": {
+ "version": "2.0.4",
+ "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
+ "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="
+ },
+ "lodash": {
+ "version": "4.17.15",
+ "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.15.tgz",
+ "integrity": "sha512-8xOcRHvCjnocdS5cpwXQXVzmmh5e5+saE2QGoeQmbKmRS6J3VQppPOIt0MnmE+4xlZoumy0GPG0D0MVIQbNA1A=="
+ },
+ "nth-check": {
+ "version": "1.0.2",
+ "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-1.0.2.tgz",
+ "integrity": "sha512-WeBOdju8SnzPN5vTUJYxYUxLeXpCaVP5i5e0LF8fg7WORF2Wd7wFX/pk0tYZk7s8T+J7VLy0Da6J1+wCT0AtHg==",
+ "requires": {
+ "boolbase": "~1.0.0"
+ }
+ },
+ "parse5": {
+ "version": "3.0.3",
+ "resolved": "https://registry.npmjs.org/parse5/-/parse5-3.0.3.tgz",
+ "integrity": "sha512-rgO9Zg5LLLkfJF9E6CCmXlSE4UVceloys8JrFqCcHloC3usd/kJCyPDwH2SOlzix2j3xaP9sUX3e8+kvkuleAA==",
+ "requires": {
+ "@types/node": "*"
+ }
+ },
+ "readable-stream": {
+ "version": "3.6.0",
+ "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.0.tgz",
+ "integrity": "sha512-BViHy7LKeTz4oNnkcLJ+lVSL6vpiFeX6/d3oSH8zCW7UxP2onchk+vTGB143xuFjHS3deTgkKoXXymXqymiIdA==",
+ "requires": {
+ "inherits": "^2.0.3",
+ "string_decoder": "^1.1.1",
+ "util-deprecate": "^1.0.1"
+ }
+ },
+ "safe-buffer": {
+ "version": "5.2.0",
+ "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.0.tgz",
+ "integrity": "sha512-fZEwUGbVl7kouZs1jCdMLdt95hdIv0ZeHg6L7qPeciMZhZ+/gdesW4wgTARkrFWEpspjEATAzUGPG8N2jJiwbg=="
+ },
+ "string_decoder": {
+ "version": "1.3.0",
+ "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz",
+ "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==",
+ "requires": {
+ "safe-buffer": "~5.2.0"
+ }
+ },
+ "util-deprecate": {
+ "version": "1.0.2",
+ "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
+ "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8="
+ }
+ }
+}
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..4c64f7a
--- /dev/null
+++ b/package.json
@@ -0,0 +1,14 @@
+{
+ "name": "japanese-lyrics",
+ "version": "1.0.0",
+ "description": "",
+ "main": "ichimoe-get.js",
+ "scripts": {
+ "test": "echo \"Error: no test specified\" && exit 1"
+ },
+ "author": "Tom Smeding <tom.smeding@gmail.com> (https://tomsmeding.com)",
+ "license": "MIT",
+ "dependencies": {
+ "cheerio": "^1.0.0-rc.3"
+ }
+}