Skip to content

Commit

Permalink
[update] pronun support
Browse files Browse the repository at this point in the history
  • Loading branch information
u3847 committed Aug 25, 2023
1 parent 470ce65 commit 8847716
Show file tree
Hide file tree
Showing 6 changed files with 370 additions and 186 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
node_modules/
src/output/html/*.html
src/output/log/
src/output/log/
src/output/audios/
33 changes: 20 additions & 13 deletions src/main-worker.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@ const path = require("path");
const { Worker, isMainThread, workerData } = require("worker_threads");
const writeToLog = require("./utils/logger");
const DEFAULT_WORKER_NUM = 8;
const fetchAudios = require("./utils/audio_downloader");

// define directory path and file path
const directoryPath = __dirname;
const filePath = path.join(__dirname, "words.txt");
const DEFAULT_WORDLIST_PATH = path.join(__dirname, "words.txt");
const DEFAULT_HTML_DIR = `${directoryPath}/output/html/`;

// define URL prefix
const urlString =
Expand Down Expand Up @@ -40,6 +42,9 @@ async function processWord(word) {
const get = bent(urlString, "GET", "string", 200);
const data = await get(`/` + word);
const $ = cheerio.load(data);
let audioDIVsBuffer = [];
fetchAudios($, word, audioDIVsBuffer);
const audioDIVsString = audioDIVsBuffer.join("");
// remove unusable icon and hint nodes
$(".daud").remove();
$(".i.i-caret-right.dtrans.fs18.lpb-4").remove();
Expand All @@ -52,17 +57,20 @@ async function processWord(word) {
$("a[href*='dictionary'] span").removeClass("dx-h");
$("a[href*='dictionary']").removeAttr("href");
const target =
`<html><head><link rel="stylesheet" href="common.css"></head>` +
`<html><head><link rel="stylesheet" href="common.css"></head><body>` +
$(".entry-body").html() +
`</html>`;
"<div class='audioGroup'>" +
audioDIVsString +
"</div></body></html>";
fs.writeFile(
`${directoryPath}/output/html/${word}.html`,
`${DEFAULT_HTML_DIR}${word}.html`,
target,
(err) => {
if (err) {
console.log(`[ERR] While writing [${word}]:`, err);
} else {
console.log(`[OK] Word [${word}] is written.`);
writeToLog(
`[ERR] WHILE WRITING [${word}]: ${err.message}`
);
writeToLog(`[OK] WORD [${word}] WRITTEN.`);
}
}
);
Expand All @@ -73,9 +81,8 @@ async function processWord(word) {
word + "\n",
(err) => {
if (err) {
console.error(
`[LOG-ERR] WHILE LOGGING MISSING OF [${word}]:`,
err.message
writeToLog(
`[LOG-ERR] WHILE LOGGING MISSING of [${word}]:${err.message}`
);
}
}
Expand All @@ -86,7 +93,7 @@ async function processWord(word) {
async function workerExecution() {
if (isMainThread) {
// read file and process the token of each line
fs.readFile(filePath, "utf8", async (err, data) => {
fs.readFile(DEFAULT_WORDLIST_PATH, "utf8", async (err, data) => {
if (err) {
writeToLog(`[ERR] ERROR READING FILE : ${err.message}`);
return;
Expand All @@ -110,7 +117,7 @@ async function workerExecution() {
});

worker.on("exit", () => {
writeToLog(`[INF] WORKER-${i} EXIT.`);
writeToLog(`[INFO] WORKER-${i} EXIT.`);
});

workers.push(worker);
Expand All @@ -122,7 +129,7 @@ async function workerExecution() {
worker.on("error", reject);
});
}
writeToLog("[INF] ALL WORKERS FINISHED.");
writeToLog("[INFO] ALL WORKERS FINISHED.");
});
} else {
const words = workerData;
Expand Down
188 changes: 101 additions & 87 deletions src/main.js
Original file line number Diff line number Diff line change
@@ -1,107 +1,121 @@
/*
* Copyright (c) 2023 @Quasimurdock
*
* Portions of this file are subject to the Apache License, Version 2.0 ("License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
* */
* Copyright (c) 2023 @Quasimurdock
*
* Portions of this file are subject to the Apache License, Version 2.0 ("License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
* */

const fs = require("fs");
const bent = require("bent");
const cheerio = require("cheerio");
const path = require("path");
const writeToLog = require("./utils/logger");
const fetchAudios = require("./utils/audio_downloader");

// Filesystem locations, all resolved relative to the directory of this script.
const directoryPath = __dirname;
// Word list to read: one token per line.
const DEFAULT_WORDLIST_PATH = path.join(__dirname, "words.txt");
// Same location under its earlier name, kept so existing references resolve.
const filePath = DEFAULT_WORDLIST_PATH;
// Directory for generated pages; the trailing slash is required because
// callers append `${word}.html` directly to this string.
const DEFAULT_HTML_DIR = `${directoryPath}/output/html/`;

// URL prefix; each word is appended as "/<word>" to build the request URL.
const urlString =
  "https://dictionary.cambridge.org/dictionary/english-chinese-simplified";

/**
 * Report whether a directory already contains an entry with the given name.
 *
 * @param {string} directory - Path of the directory to inspect.
 * @param {string} filename - Bare entry name to look for (no path prefix).
 * @returns {boolean} `true` when an entry named exactly `filename` exists
 *   directly inside `directory`, otherwise `false`.
 * @throws {Error} Whatever `fs.readdirSync` throws, e.g. when `directory`
 *   does not exist or cannot be read.
 */
function checkDuplicateFiles(directory, filename) {
  // readdirSync yields bare entry names, so an exact string match suffices.
  return fs.readdirSync(directory).includes(filename);
}

// Read the word list, then fetch, clean and save one dictionary page per word.
// Words are processed one at a time: each request is awaited before the next
// one starts.
fs.readFile(DEFAULT_WORDLIST_PATH, "utf8", async (err, wordList) => {
  if (err) {
    console.error("ERROR READING FILE:", err);
    writeToLog(`[ERR] FAILED TO READ FILE: ${err.message}`);
    return;
  }

  // The HTTP client depends only on constants, so build it once for all words.
  const get = bent(urlString, "GET", "string", 200);

  for (const line of wordList.split("\n")) {
    // trim() also strips a trailing "\r" left by CRLF line endings
    const word = line.trim();
    if (word === "") {
      continue;
    }

    // skip words whose page already exists in the same directory we write to
    if (checkDuplicateFiles(DEFAULT_HTML_DIR, `${word}.html`)) {
      console.log(`[SKIP] Duplicated [${word}] file exists.`);
      continue;
    }

    try {
      const page = await get(`/${word}`);
      const $ = cheerio.load(page);

      // Collect audio markup before the pruning below removes nodes from $.
      // NOTE(review): fetchAudios is not awaited and its definition is not
      // visible here; if it fills the buffer asynchronously the joined string
      // may be empty at this point. Confirm against utils/audio_downloader.
      const audioDIVsBuffer = [];
      fetchAudios($, word, audioDIVsBuffer);
      const audioDIVsString = audioDIVsBuffer.join("");

      // remove unusable icon and hint nodes
      $(".daud").remove();
      $(".i.i-caret-right.dtrans.fs18.lpb-4").remove();
      $(".dwla.wordlist-add-button").remove();
      $(".hfr.lpb-2").remove();
      $("i.i-plus.ca_hi").remove();
      $("script").remove();
      // strip the href from every <a> that links to a dictionary page, and
      // drop the "dx-h" class from the <span> nodes nested inside those links
      $("a[href*='dictionary'] span").removeClass("dx-h");
      $("a[href*='dictionary']").removeAttr("href");

      const target =
        `<html><head><link rel="stylesheet" href="common.css"></head><body>` +
        $(".entry-body").html() +
        "<div class='audioGroup'>" +
        audioDIVsString +
        "</div></body></html>";

      fs.writeFile(`${DEFAULT_HTML_DIR}${word}.html`, target, (writeErr) => {
        if (writeErr) {
          writeToLog(`[ERR] WHILE WRITING [${word}]: ${writeErr.message}`);
        } else {
          writeToLog(`[OK] WORD [${word}] WRITTEN.`);
        }
      });
    } catch (fetchErr) {
      // A failed fetch or parse must not stop the run: log it, record the
      // word as missing, and continue with the next word.
      const fetchUrl = `${urlString}/${word}`;
      writeToLog(
        `[ERR] WHILE FINDING ${word}: ${fetchUrl} ${fetchErr.message}`
      );
      fs.appendFile(
        `${directoryPath}/output/log/missing_words.txt`,
        `${word}\n`,
        (appendErr) => {
          if (appendErr) {
            writeToLog(
              `[LOG-ERR] WHILE LOGGING MISSING of [${word}]:${appendErr.message}`
            );
          }
        }
      );
    }
  }
});
Loading

0 comments on commit 8847716

Please sign in to comment.