From 0e556a7eedb779adfc36db2aecd378eccaaa2420 Mon Sep 17 00:00:00 2001 From: Tom Smeding Date: Sat, 21 Mar 2020 18:55:33 +0100 Subject: Basic ichimoe parser --- .gitignore | 1 + ichimoe-get.js | 185 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ package-lock.json | 150 +++++++++++++++++++++++++++++++++++++++++++ package.json | 14 +++++ 4 files changed, 350 insertions(+) create mode 100644 .gitignore create mode 100755 ichimoe-get.js create mode 100644 package-lock.json create mode 100644 package.json diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c2658d7 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +node_modules/ diff --git a/ichimoe-get.js b/ichimoe-get.js new file mode 100755 index 0000000..f929db5 --- /dev/null +++ b/ichimoe-get.js @@ -0,0 +1,185 @@ +#!/usr/bin/env node +const https = require("https"); +const util = require("util"); +const cheerio = require("cheerio"); + +function atob(b64) { + return Buffer.from(b64, "base64").toString(); +} + +function normaliseArg(arg) { + if ((arg[0] == '"' || arg[0] == "'") && arg[arg.length - 1] == arg[0]) { + return arg.slice(1, arg.length - 1); + } else { + return arg; + } +} + +function parseDefinitionLI($, li) { + const items = $(li).children(); + let pos = null, desc = null, unknown = []; + + for (let i = 0; i < items.length; i++) { + if (items[i].type != "tag") continue; + const tagname = items[i].name; + const cls = items[i].attribs.class; + if (tagname == "span" && cls == "pos-desc") { + pos = $(items[i]).text().trim(); + } else if (tagname == "span" && cls == "gloss-desc") { + desc = $(items[i]).text().trim(); + } else if (tagname == "span" && cls == "sense-info-note has-tip tip-top") { + // note, let's skip those + } else { + unknown.push("li?? " + tagname + " " + cls); + } + } + + if (unknown.length) return {pos, desc, unknown}; + else return {pos, desc}; +} + +function parseDefinitionsOL($, ol) { + const definitions = []; + const lis = $(ol).children(); + for (let j = 0; j < lis.length; j++) { + definitions.push(parseDefinitionLI($, lis[j])); + } + return definitions; +} + +function parseDL($, dl) { + const alternatives = []; + const items = $(dl).children(); + for (let i = 0; i < items.length/2; i++) { + const dt = items[2*i]; + const dd = items[2*i+1]; + // This replace is not always necessary but it seems reasonably safe + const text = $(dt).text().trim().replace(new RegExp(`^${i+1}\\. `), ""); + const result = parseDefinitionsDD($, dd); + alternatives.push({text, ...result}); + } + return alternatives; +} + +function parseConjugationDiv($, div) { + const items = $(div).children(); + let formdesc = null; + let alternatives = []; + + for (let i = 0; i < items.length; i++) { + if (items[i].type != "tag") continue; + const tagname = items[i].name; + const cls = items[i].attribs.class; + if (tagname == "div" && cls == "conj-prop") { + if (formdesc == null) formdesc = $(items[i]).text().trim().replace(/ +/g, " "); + } else if (tagname == "div" && cls == "conj-gloss") { + const dl = $(items[i]).children()[0]; + alternatives = alternatives.concat(parseDL($, dl)); + } else { + unknown.push("conj?? " + tagname + " " + cls); + } + } + + return { + form: formdesc, + alternatives + }; +} + +function parseConjugationsDiv($, div) { + const conjugations = []; + const divs = $(div).children(); + for (let j = 0; j < divs.length; j++) { + conjugations.push(parseConjugationDiv($, divs[j])); + } + return conjugations; +} + +function parseDefinitionsDD($, dd) { + const items = $(dd).children(); + let definitions = []; + let conjugations = []; + let compounds = []; + let suffix = null; + const unknown = []; + + for (let i = 0; i < items.length; i++) { + if (items[i].type != "tag") continue; + const tagname = items[i].name; + const cls = items[i].attribs.class; + if (tagname == "ol" && cls == "gloss-definitions") { + definitions = definitions.concat(parseDefinitionsOL($, items[i])); + } else if (tagname == "div" && cls == "conjugations") { + conjugations = conjugations.concat(parseConjugationsDiv($, items[i])); + } else if (tagname == "span" && cls == "compound-desc") { + compounds.push({ + desc: $("span.compound-desc-word", items[i]).text().trim(), + parts: null, + }); + } else if (tagname == "dl" && cls == "compounds") { + if (compounds.length > 0 && compounds[compounds.length - 1].parts == null) { + compounds[compounds.length - 1].parts = parseDL($, items[i]); + } + } else if (tagname == "span" && cls == "suffix-desc") { + suffix = $(items[i]).text().trim(); + } else { + unknown.push("dd?? " + tagname + " " + cls); + } + } + + let result = {}; + if (definitions.length) result.definitions = definitions; + if (conjugations.length) result.conjugations = conjugations; + if (compounds.length) result.compounds = compounds; + if (suffix != null) result.suffix = suffix; + if (unknown.length) result.unknown = unknown; + return result; +} + +function parseWordLI($, li) { + const romaji = $($(".gloss-rtext", li)[0]).text().trim(); + const dl = $(".gloss-content > dl.alternatives", li); + const alternatives = parseDL($, dl); + return {romaji, alternatives}; +} + +function parseIchimoeHTML(html) { + const $ = cheerio.load(html); + const words = []; + const rows = $(".gloss-row"); + for (let i = 0; i < rows.length; i++) { + if ($(rows[i]).hasClass("hidden")) continue; + const lis = $($("ul", rows[i])[0]).children(); + for (let j = 0; j < lis.length; j++) { + words.push(parseWordLI($, lis[j])); + } + } + return words; +} + +// const result = parseIchimoeHTML(require("fs").readFileSync("kioku.ichimoe.html")); +// console.log(util.inspect(result, {depth: Infinity, colors: true})); +// process.exit(); + +if (process.argv.length != 3) { + console.error(`Usage: ./ichimoe-get.js <文>`); + process.exit(1); +} + +const word = process.argv[2]; +const url = `https://ichi.moe/cl/qr/?q=${encodeURIComponent(word)}&r=htr`; + +https.get(url, res => { + if (res.statusCode != 200) { + console.error(`Ichi.moe returned status code: ${res.statusCode}`); + console.error(res.headers); + process.exit(1); + } + + let body = ""; + res.on("data", data => body += data); + res.on("end", () => { + const result = parseIchimoeHTML(body); + console.log(util.inspect(result, {depth: Infinity})); + }); +}); diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 0000000..a0af348 --- /dev/null +++ b/package-lock.json @@ -0,0 +1,150 @@ +{ + "name": "japanese-lyrics", + "version": "1.0.0", + "lockfileVersion": 1, + "requires": true, + "dependencies": { + "@types/node": { + "version": "13.7.7", + "resolved": "https://registry.npmjs.org/@types/node/-/node-13.7.7.tgz", + "integrity": "sha512-Uo4chgKbnPNlxQwoFmYIwctkQVkMMmsAoGGU4JKwLuvBefF0pCq4FybNSnfkfRCpC7ZW7kttcC/TrRtAJsvGtg==" + }, + "boolbase": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", + "integrity": "sha1-aN/1++YMUes3cl6p4+0xDcwed24=" + }, + "cheerio": { + "version": "1.0.0-rc.3", + "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.3.tgz", + "integrity": "sha512-0td5ijfUPuubwLUu0OBoe98gZj8C/AA+RW3v67GPlGOrvxWjZmBXiBCRU+I8VEiNyJzjth40POfHiz2RB3gImA==", + "requires": { + "css-select": "~1.2.0", + "dom-serializer": "~0.1.1", + "entities": "~1.1.1", + "htmlparser2": "^3.9.1", + "lodash": "^4.15.0", + "parse5": "^3.0.1" + } + }, + "css-select": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/css-select/-/css-select-1.2.0.tgz", + "integrity": "sha1-KzoRBTnFNV8c2NMUYj6HCxIeyFg=", + "requires": { + "boolbase": "~1.0.0", + "css-what": "2.1", + "domutils": "1.5.1", + "nth-check": "~1.0.1" + } + }, + "css-what": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/css-what/-/css-what-2.1.3.tgz", + "integrity": "sha512-a+EPoD+uZiNfh+5fxw2nO9QwFa6nJe2Or35fGY6Ipw1R3R4AGz1d1TEZrCegvw2YTmZ0jXirGYlzxxpYSHwpEg==" + }, + "dom-serializer": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-0.1.1.tgz", + "integrity": "sha512-l0IU0pPzLWSHBcieZbpOKgkIn3ts3vAh7ZuFyXNwJxJXk/c4Gwj9xaTJwIDVQCXawWD0qb3IzMGH5rglQaO0XA==", + "requires": { + "domelementtype": "^1.3.0", + "entities": "^1.1.1" + } + }, + "domelementtype": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-1.3.1.tgz", + "integrity": "sha512-BSKB+TSpMpFI/HOxCNr1O8aMOTZ8hT3pM3GQ0w/mWRmkhEDSFJkkyzz4XQsBV44BChwGkrDfMyjVD0eA2aFV3w==" + }, + "domhandler": { + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-2.4.2.tgz", + "integrity": "sha512-JiK04h0Ht5u/80fdLMCEmV4zkNh2BcoMFBmZ/91WtYZ8qVXSKjiw7fXMgFPnHcSZgOo3XdinHvmnDUeMf5R4wA==", + "requires": { + "domelementtype": "1" + } + }, + "domutils": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-1.5.1.tgz", + "integrity": "sha1-3NhIiib1Y9YQeeSMn3t+Mjc2gs8=", + "requires": { + "dom-serializer": "0", + "domelementtype": "1" + } + }, + "entities": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz", + "integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w==" + }, + "htmlparser2": { + "version": "3.10.1", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.10.1.tgz", + "integrity": "sha512-IgieNijUMbkDovyoKObU1DUhm1iwNYE/fuifEoEHfd1oZKZDaONBSkal7Y01shxsM49R4XaMdGez3WnF9UfiCQ==", + "requires": { + "domelementtype": "^1.3.1", + "domhandler": "^2.3.0", + "domutils": "^1.5.1", + "entities": "^1.1.1", + "inherits": "^2.0.1", + "readable-stream": "^3.1.1" + } + }, + "inherits": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", + "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==" + }, + "lodash": { + "version": "4.17.15", + "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.15.tgz", + "integrity": "sha512-8xOcRHvCjnocdS5cpwXQXVzmmh5e5+saE2QGoeQmbKmRS6J3VQppPOIt0MnmE+4xlZoumy0GPG0D0MVIQbNA1A==" + }, + "nth-check": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-1.0.2.tgz", + "integrity": "sha512-WeBOdju8SnzPN5vTUJYxYUxLeXpCaVP5i5e0LF8fg7WORF2Wd7wFX/pk0tYZk7s8T+J7VLy0Da6J1+wCT0AtHg==", + "requires": { + "boolbase": "~1.0.0" + } + }, + "parse5": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-3.0.3.tgz", + "integrity": "sha512-rgO9Zg5LLLkfJF9E6CCmXlSE4UVceloys8JrFqCcHloC3usd/kJCyPDwH2SOlzix2j3xaP9sUX3e8+kvkuleAA==", + "requires": { + "@types/node": "*" + } + }, + "readable-stream": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.0.tgz", + "integrity": "sha512-BViHy7LKeTz4oNnkcLJ+lVSL6vpiFeX6/d3oSH8zCW7UxP2onchk+vTGB143xuFjHS3deTgkKoXXymXqymiIdA==", + "requires": { + "inherits": "^2.0.3", + "string_decoder": "^1.1.1", + "util-deprecate": "^1.0.1" + } + }, + "safe-buffer": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.0.tgz", + "integrity": "sha512-fZEwUGbVl7kouZs1jCdMLdt95hdIv0ZeHg6L7qPeciMZhZ+/gdesW4wgTARkrFWEpspjEATAzUGPG8N2jJiwbg==" + }, + "string_decoder": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", + "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==", + "requires": { + "safe-buffer": "~5.2.0" + } + }, + "util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=" + } + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..4c64f7a --- /dev/null +++ b/package.json @@ -0,0 +1,14 @@ +{ + "name": "japanese-lyrics", + "version": "1.0.0", + "description": "", + "main": "ichimoe-get.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "author": "Tom Smeding (https://tomsmeding.com)", + "license": "MIT", + "dependencies": { + "cheerio": "^1.0.0-rc.3" + } +} -- cgit v1.2.3