diff --git a/.gitignore b/.gitignore index 685fbfe..7bed457 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ node_modules/ src/output/html/*.html -src/output/log/ \ No newline at end of file +src/output/log/ +src/output/audios/ \ No newline at end of file diff --git a/src/main-worker.js b/src/main-worker.js index 58b089d..da9db41 100644 --- a/src/main-worker.js +++ b/src/main-worker.js @@ -5,10 +5,12 @@ const path = require("path"); const { Worker, isMainThread, workerData } = require("worker_threads"); const writeToLog = require("./utils/logger"); const DEFAULT_WORKER_NUM = 8; +const fetchAudios = require("./utils/audio_downloader"); // define directory path and file path const directoryPath = __dirname; -const filePath = path.join(__dirname, "words.txt"); +const DEFAULT_WORDLIST_PATH = path.join(__dirname, "words.txt"); +const DEFAULT_HTML_DIR = `${directoryPath}/output/html/`; // define URL prefix const urlString = @@ -40,6 +42,9 @@ async function processWord(word) { const get = bent(urlString, "GET", "string", 200); const data = await get(`/` + word); const $ = cheerio.load(data); + let audioDIVsBuffer = []; + fetchAudios($, word, audioDIVsBuffer); + const audioDIVsString = audioDIVsBuffer.join(""); // remove unusable icon and hint nodes $(".daud").remove(); $(".i.i-caret-right.dtrans.fs18.lpb-4").remove(); @@ -52,17 +57,20 @@ async function processWord(word) { $("a[href*='dictionary'] span").removeClass("dx-h"); $("a[href*='dictionary']").removeAttr("href"); const target = - `` + + `` + $(".entry-body").html() + - ``; + "
" + + audioDIVsString + + "
"; fs.writeFile( - `${directoryPath}/output/html/${word}.html`, + `${DEFAULT_HTML_DIR}${word}.html`, target, (err) => { if (err) { - console.log(`[ERR] While writing [${word}]:`, err); - } else { - console.log(`[OK] Word [${word}] is written.`); + writeToLog( + `[ERR] WHILE WRITING [${word}]: ${err.message}` + ); + writeToLog(`[OK] WORD [${word}] WRITTEN.`); } } ); @@ -73,9 +81,8 @@ async function processWord(word) { word + "\n", (err) => { if (err) { - console.error( - `[LOG-ERR] WHILE LOGGING MISSING OF [${word}]:`, - err.message + writeToLog( + `[LOG-ERR] WHILE LOGGING MISSING of [${word}]:${err.message}` ); } } @@ -86,7 +93,7 @@ async function processWord(word) { async function workerExecution() { if (isMainThread) { // read file and process the token of each line - fs.readFile(filePath, "utf8", async (err, data) => { + fs.readFile(DEFAULT_WORDLIST_PATH, "utf8", async (err, data) => { if (err) { writeToLog(`[ERR] ERROR READING FILE : ${err.message}`); return; @@ -110,7 +117,7 @@ async function workerExecution() { }); worker.on("exit", () => { - writeToLog(`[INF] WORKER-${i} EXIT.`); + writeToLog(`[INFO] WORKER-${i} EXIT.`); }); workers.push(worker); @@ -122,7 +129,7 @@ async function workerExecution() { worker.on("error", reject); }); } - writeToLog("[INF] ALL WORKERS FINISHED."); + writeToLog("[INFO] ALL WORKERS FINISHED."); }); } else { const words = workerData; diff --git a/src/main.js b/src/main.js index 622a598..4bd6a3e 100644 --- a/src/main.js +++ b/src/main.js @@ -1,107 +1,121 @@ /* - * Copyright (c) 2023 @Quasimurdock - * - * Portions of this file are subject to the Apache License, Version 2.0 ("License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed - * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR - * CONDITIONS OF ANY KIND, either express or implied. See the License for the - * specific language governing permissions and limitations under the License. - * */ + * Copyright (c) 2023 @Quasimurdock + * + * Portions of this file are subject to the Apache License, Version 2.0 ("License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * */ const fs = require("fs"); const bent = require("bent"); const cheerio = require("cheerio"); const path = require("path"); +const writeToLog = require("./utils/logger"); +const fetchAudios = require("./utils/audio_downloader"); // define directory path and file path const directoryPath = __dirname; -const filePath = path.join(__dirname, "words.txt"); +const DEFAULT_WORDLIST_PATH = path.join(__dirname, "words.txt"); +const DEFAULT_HTML_DIR = `${directoryPath}/output/html/`; // define URL prefix const urlString = - "https://dictionary.cambridge.org/dictionary/english-chinese-simplified"; + "https://dictionary.cambridge.org/dictionary/english-chinese-simplified"; function checkDuplicateFiles(directory, filename) { - const files = fs.readdirSync(directory); // read all files under the directory - for (let file of files) { - if (file === filename) { - return true; // duplicated files existed - } - } - return false; + const files = fs.readdirSync(directory); // read all files under the directory + for (let file of files) { + if (file === filename) { + return true; // duplicated files existed + } + } + return false; } // read file and process the token of each line -fs.readFile(filePath, "utf8", async (err, data) => { - if (err) { - console.error("Error reading file:", err); - return; - } +fs.readFile(DEFAULT_WORDLIST_PATH, "utf8", async (err, data) => { + if (err) { + console.error("ERROR READING FILE:", err); + writeToLog(`[ERR] FAILED TO READ FILE: ${err.message}`); + return; + } - const words = data.split("\n"); - for (let word of words) { - word = word.trim(); - // judge if empty - if (word === "" || word.length == 0) { - continue; - } + const words = data.split("\n"); + for (let word of words) { + word = word.trim(); + // judge if empty + if (word === "" || word.length == 0) { + continue; + } - // check if duplicated file exists - if (checkDuplicateFiles(directoryPath + "/output/html", word + ".html")) { - console.log(`[SKIP] Duplicated [${word}] file exists.`); - continue; - } - // concat strings - try { - const get = bent(urlString, "GET", "string", 200); - const data = await get(`/` + word); - const $ = cheerio.load(data); - // remove unusable icon and hint nodes - $(".daud").remove(); - $(".i.i-caret-right.dtrans.fs18.lpb-4").remove(); - $(".dwla.wordlist-add-button").remove(); - $(".hfr.lpb-2").remove(); - $("i.i-plus.ca_hi").remove(); - $("script").remove(); - // remove all dictionary href links from nodes with - // and underlines of its sub nodes - $("a[href*='dictionary'] span").removeClass("dx-h"); - $("a[href*='dictionary']").removeAttr("href"); - const target = - `` + - $(".entry-body").html() + - ``; - fs.writeFile( - `${directoryPath}/output/html/${word}.html`, - target, - (err) => { - if (err) { - console.log(`[ERR] While writing [${word}]:`, err); - } else { - console.log(`[OK] Word [${word}] is written.`); - } - } - ); - } catch (error) { - const fetchUrl = urlString + "/" + word; - console.error(`[ERR] While finding ${word}:`, fetchUrl); - console.error(error); - fs.appendFile( - `${directoryPath}/output/log/missing_words.txt`, - word + "\n", - (err) => { - if (err) { - console.log(`[LOG-ERR] While logging missing of [${word}]:`, err); - } else { - console.log(`[LOG-OK] Missing of word [${word}] is logged.`); - } - } - ); - } - } + // check if duplicated file exists + if ( + checkDuplicateFiles(directoryPath + "/output/html", word + ".html") + ) { + console.log(`[SKIP] Duplicated [${word}] file exists.`); + continue; + } + // concat strings + try { + const get = bent(urlString, "GET", "string", 200); + const data = await get(`/` + word); + const $ = cheerio.load(data); + let audioDIVsBuffer = []; + fetchAudios($, word, audioDIVsBuffer); + const audioDIVsString = audioDIVsBuffer.join(""); + // remove unusable icon and hint nodes + $(".daud").remove(); + $(".i.i-caret-right.dtrans.fs18.lpb-4").remove(); + $(".dwla.wordlist-add-button").remove(); + $(".hfr.lpb-2").remove(); + $("i.i-plus.ca_hi").remove(); + $("script").remove(); + // remove all dictionary href links from nodes with + // and underlines of its sub nodes + $("a[href*='dictionary'] span").removeClass("dx-h"); + $("a[href*='dictionary']").removeAttr("href"); + const target = + `` + + $(".entry-body").html() + + "
" + + audioDIVsString + + "
"; + fs.writeFile( + `${DEFAULT_HTML_DIR}${word}.html`, + target, + (err) => { + if (err) { + writeToLog( + `[ERR] WHILE WRITING [${word}]: ${err.message}` + ); + } else { + writeToLog(`[OK] WORD [${word}] WRITTEN.`); + } + } + ); + } catch (err) { + const fetchUrl = urlString + "/" + word; + writeToLog( + `[ERR] WHILE FINDING ${word}: ${fetchUrl} ${err.message}` + ); + fs.appendFile( + `${directoryPath}/output/log/missing_words.txt`, + word + "\n", + (err) => { + if (err) { + writeToLog( + `[LOG-ERR] WHILE LOGGING MISSING of [${word}]:${err.message}` + ); + } + } + ); + } + } }); diff --git a/src/mkanki.js b/src/mkanki.js index 4699008..80c0bd1 100644 --- a/src/mkanki.js +++ b/src/mkanki.js @@ -5,106 +5,174 @@ const anki = require("mkanki"); const writeToLog = require("./utils/logger"); const directoryPath = __dirname; -const HTML_DIR = directoryPath + "/output/html/"; +const DEFAULT_HTML_DIR = directoryPath + "/output/html/"; +const DEFAULT_AUDIO_DIR = directoryPath + "/output/audios/"; const DEFAULT_DECK_NAME = "IELTS-CamDict-Words"; const DEFAULT_NOTE_TYPE_NAME = "BasicCamCard"; const DEFAULT_CSS_FILENAME = "common.css"; const DEFAULT_APKG_NAME = DEFAULT_DECK_NAME + ".apkg"; -const cssData = fs.readFileSync(HTML_DIR + DEFAULT_CSS_FILENAME, "utf8"); +const cssData = fs.readFileSync(DEFAULT_HTML_DIR + DEFAULT_CSS_FILENAME, "utf8"); +const audioPlayScriptBlock = ``; const model = new anki.Model({ - name: DEFAULT_NOTE_TYPE_NAME, - id: Date.now().toString(), - flds: [{ name: "Word" }, { name: "Front" }, { name: "Back" }], - req: [[0, "all", [0]]], - tmpls: [ - { - name: "Card 1", - qfmt: "{{Front}}", - afmt: "{{FrontSide}}\n\n
\n\n{{Back}}", - }, - ], - css: cssData, + name: DEFAULT_NOTE_TYPE_NAME, + id: Date.now().toString(), + flds: [{ name: "Word" }, { name: "Front" }, { name: "Back" }], + req: [[0, "all", [0]]], + tmpls: [ + { + name: "Card 1", + qfmt: "{{Front}}" + `\n${audioPlayScriptBlock}`, + afmt: "{{Back}}" + `\n${audioPlayScriptBlock}`, + }, + ], + css: cssData, }); const deck = new anki.Deck(Date.now(), DEFAULT_DECK_NAME); const package = new anki.Package(); async function processHtmlFilesNew() { - try { - const files = await new Promise((resolve, reject) => { - fs.readdir(HTML_DIR, (err, files) => { - if (err) { - writeToLog(err.message); - reject(err); - } - resolve(files); - }); - }); - let cnt = 0; - function addNote(file) { - return new Promise((resolve, reject) => { - if (path.extname(file) === ".html") { - const currentWord = file.match(/^(.+)\.html$/)[1]; - const filePath = path.join(HTML_DIR, file); - fs.readFile(filePath, "utf8", (err, data) => { - if (err) { - writeToLog(err.message); - reject(err); - } else { - try { - const $ = cheerio.load(data); - let front = ""; - $(".pos-header.dpos-h") - .toArray() - .forEach((ele) => { - front += $(ele).html() + "
"; - }); - const back = $("body").html(); - if (!front || !back) { - throw new Error( - "EMPTY RESULT OF EXTRACTING FRONT OR BACK NOTES" - ); - } - const tags = $(".def-info.ddef-info") - .toArray() - .map((ele) => - $(ele).find("span[class*='epp-xref dxref']").html() - ) - .filter((ele) => ele !== null); - let tagResult = []; - if (tags.length != 0) { - const tagsDictinct = [...new Set(tags)].sort(); - tagResult = tagsDictinct; - } - deck.addNote(model.note([currentWord, front, back], tagResult)); - console.log( - `[INFO] ${++cnt}/${ - files.length - 1 - } CURRENT WORD: ${currentWord}` - ); - resolve(); - } catch (err) { - writeToLog(`[ERR] ERROR PROCESSSING ${file}: ${err.message}`); - reject(err); - } - } - }); - } else { - resolve(); - } - }); - } - await Promise.allSettled(files.map(addNote)); - } catch (err) { - writeToLog(`[ERR] UNKNONW ERROR ${err.message}`); - } + try { + const files = await new Promise((resolve, reject) => { + fs.readdir(DEFAULT_HTML_DIR, (err, files) => { + if (err) { + writeToLog(err.message); + reject(err); + } + resolve(files); + }); + }); + let cnt = 0; + function addNote(file) { + return new Promise((resolve, reject) => { + if (path.extname(file) === ".html") { + const currentWord = file.match(/^(.+)\.html$/)[1]; + const filePath = path.join(DEFAULT_HTML_DIR, file); + fs.readFile(filePath, "utf8", (err, data) => { + if (err) { + writeToLog(err.message); + reject(err); + } else { + try { + const $ = cheerio.load(data); + let front = ""; + $(".pos-header.dpos-h") + .toArray() + .forEach((ele) => { + front += + $(ele).html() + "
"; + }); + front += $(".audioGroup").html(); + const back = $("body").html(); + if (!front || !back) { + throw new Error( + "EMPTY RESULT OF EXTRACTING FRONT OR BACK NOTES" + ); + } + const tags = $(".def-info.ddef-info") + .toArray() + .map((ele) => + $(ele) + .find( + "span[class*='epp-xref dxref']" + ) + .html() + ) + .filter((ele) => ele !== null); + let tagResult = []; + if (tags.length != 0) { + const tagsDictinct = [ + ...new Set(tags), + ].sort(); + tagResult = tagsDictinct; + } + deck.addNote( + model.note( + [currentWord, front, back], + tagResult + ) + ); + console.log( + `[INFO] ${++cnt}/${ + files.length - 1 + } CURRENT WORD: ${currentWord}` + ); + resolve(); + } catch (err) { + writeToLog( + `[ERR] ERROR PROCESSSING ${file}: ${err.message}` + ); + reject(err); + } + } + }); + } else { + resolve(); + } + }); + } + await Promise.allSettled(files.map(addNote)); + } catch (err) { + writeToLog(`[ERR] UNKNONW ERROR ${err.message}`); + } +} + +async function processAudioFiles() { + try { + const audioFiles = await new Promise((resolve, reject) => { + fs.readdir(DEFAULT_AUDIO_DIR, (err, files) => { + if (err) { + writeToLog(err.message); + reject(err); + } + resolve(files); + }); + }); + let cnt = 0; + function addAudio(file) { + return new Promise((resolve, reject) => { + if (path.extname(file) === ".mp3") { + try { + const audioFilePath = path.join(DEFAULT_AUDIO_DIR, file); + /** + * Note: package.addMediaFile + * 1st param for absolute path, + * 2nd param for real filename */ + package.addMediaFile(audioFilePath, file); + console.log( + `[INFO] ${++cnt}/${ + audioFiles.length + } CURRENT AUDIO: ${file}` + ); + resolve(file); + } catch (err) { + writeToLog( + `[ERR] WHILE ADDING AUDIO ${file} ` + err.message + ); + reject(err); + } + } else { + resolve(`NOT FOUND ${file}`); + } + }); + } + await Promise.allSettled(audioFiles.map(addAudio)); + } catch (err) { + writeToLog(`[ERR] UNKNONW ERROR ${err.message}`); + } } async function convert() { - await processHtmlFilesNew(); - package.addDeck(deck); - package.writeToFile(DEFAULT_APKG_NAME); + await processHtmlFilesNew(); + await processAudioFiles(); + package.addDeck(deck); + package.writeToFile(DEFAULT_APKG_NAME); } convert(); diff --git a/src/output/html/common.css b/src/output/html/common.css index dbdc65c..f29ea30 100644 --- a/src/output/html/common.css +++ b/src/output/html/common.css @@ -2,6 +2,10 @@ padding-left:10px; } +.sound{ + display: none; +} + img, legend { border: 0 diff --git a/src/utils/audio_downloader.js b/src/utils/audio_downloader.js new file mode 100644 index 0000000..1209faa --- /dev/null +++ b/src/utils/audio_downloader.js @@ -0,0 +1,90 @@ +const fs = require("fs"); +const bent = require("bent"); +const writeToLog = require("./logger"); +const stream = require("stream"); +const { promisify } = require("util"); +const pipeline = promisify(stream.pipeline); + +const DEFAULT_AUDIO_PATH = "src/output/audios"; +const DEFAULT_ROOT_URL_STR = "https://dictionary.cambridge.org"; + +async function downloadFile(urlToken, filePath) { + const get = bent(DEFAULT_ROOT_URL_STR); + const readable = await get(urlToken); + const writable = fs.createWriteStream(filePath); + const res = await pipeline(readable, writable).then((res) => { + return Promise.resolve(extractAudioFilename(urlToken, true)); + }); + return res; +} + +function mkdir4Audios() { + const directoryName = DEFAULT_AUDIO_PATH; + if (!fs.existsSync(directoryName)) { + try { + fs.mkdirSync(directoryName, { recursive: true }); + writeToLog(`[INFO] DIR ${directoryName} CREATED`); + return true; + } catch (error) { + writeToLog(`[ERR] FAILED TO CREATE DIR${directoryName}:`, error); + return false; + } + } else { + // writeToLog(`[INFO] DIR ${directoryName} EXISTED`); + return true; + } +} + +function extractAudioFilename(url, isSuffix) { + const regex = /\/([^/]+\.mp3)$/; + if (!isSuffix) return url.match(/\/([^/]+)\.mp3$/)[1]; + return url.match(regex)[1]; +} + +function fetchAudios($, word, audioDIVsBuffer) { + const daud = $(".entry-body .daud"); + const pronunUrlArray = daud + .toArray() + .flatMap((e) => + e.childNodes.filter((k) => k.type == "tag" && k.name == "audio") + ) + .map((e) => e.childNodes[3].attribs["src"]); + /** ============================================================ + * construct div elements for anki pronun audio support + */ + const pron = $(".entry-body .pron.dpron").toArray(); + for (let i = 0; i < pron.length; i++) { + const target = + word + "-" + extractAudioFilename(pronunUrlArray[i], false); + // console.log(target); + audioDIVsBuffer.push( + `
[sound:${target}.mp3]
` + ); + $(pron[i]).attr("onClick", `playAudio('${target}');`); + } + /** ============================================================ */ + const pronunUrlArrayUnique = [...new Set(pronunUrlArray)]; + // console.log(pronunUrlArrayUnique); + const isMkdir = mkdir4Audios(); + if (!isMkdir) return; + Promise.allSettled( + pronunUrlArrayUnique.map((url) => { + const regex = /\/([^/]+\.mp3)$/; + const filename = url.match(regex)[1]; + return downloadFile( + url, + DEFAULT_AUDIO_PATH + `/${word}-${filename}` + ); + }) + ) + .then((result) => + writeToLog( + `[INFO] AUDIO FILE [${result + .map((e) => e.value) + .join(", ")}] WRITTEN` + ) + ) + .catch((err) => writeToLog(err.message)); +} + +module.exports = fetchAudios;