sub nodes
+ $("a[href*='dictionary'] span").removeClass("dx-h");
+ $("a[href*='dictionary']").removeAttr("href");
+ const target =
+ `` +
+ $(".entry-body").html() +
+ "" +
+ audioDIVsString +
+ "
";
+ fs.writeFile(
+ `${DEFAULT_HTML_DIR}${word}.html`,
+ target,
+ (err) => {
+ if (err) {
+ writeToLog(
+ `[ERR] WHILE WRITING [${word}]: ${err.message}`
+ );
+ } else {
+ writeToLog(`[OK] WORD [${word}] WRITTEN.`);
+ }
+ }
+ );
+ } catch (err) {
+ const fetchUrl = urlString + "/" + word;
+ writeToLog(
+ `[ERR] WHILE FINDING ${word}: ${fetchUrl} ${err.message}`
+ );
+ fs.appendFile(
+ `${directoryPath}/output/log/missing_words.txt`,
+ word + "\n",
+ (err) => {
+ if (err) {
+ writeToLog(
+ `[LOG-ERR] WHILE LOGGING MISSING of [${word}]:${err.message}`
+ );
+ }
+ }
+ );
+ }
+ }
});
diff --git a/src/mkanki.js b/src/mkanki.js
index 4699008..80c0bd1 100644
--- a/src/mkanki.js
+++ b/src/mkanki.js
@@ -5,106 +5,174 @@ const anki = require("mkanki");
const writeToLog = require("./utils/logger");
+// Input directories below are built from this script's own directory (__dirname).
const directoryPath = __dirname;
-const HTML_DIR = directoryPath + "/output/html/";
+const DEFAULT_HTML_DIR = directoryPath + "/output/html/";
+const DEFAULT_AUDIO_DIR = directoryPath + "/output/audios/";
const DEFAULT_DECK_NAME = "IELTS-CamDict-Words";
const DEFAULT_NOTE_TYPE_NAME = "BasicCamCard";
const DEFAULT_CSS_FILENAME = "common.css";
const DEFAULT_APKG_NAME = DEFAULT_DECK_NAME + ".apkg";
-const cssData = fs.readFileSync(HTML_DIR + DEFAULT_CSS_FILENAME, "utf8");
+const cssData = fs.readFileSync(DEFAULT_HTML_DIR + DEFAULT_CSS_FILENAME, "utf8");
+// Appended (after a newline) to both card templates below; it is an empty string here.
+const audioPlayScriptBlock = ``;
+// Note type with the fields Word, Front and Back and one card template;
+// the stylesheet read above is attached as the note type's CSS.
+// NOTE(review): the id is taken from Date.now(), so every run yields a
+// different model id — confirm that is intended.
const model = new anki.Model({
- name: DEFAULT_NOTE_TYPE_NAME,
- id: Date.now().toString(),
- flds: [{ name: "Word" }, { name: "Front" }, { name: "Back" }],
- req: [[0, "all", [0]]],
- tmpls: [
- {
- name: "Card 1",
- qfmt: "{{Front}}",
- afmt: "{{FrontSide}}\n\n
\n\n{{Back}}",
- },
- ],
- css: cssData,
+ name: DEFAULT_NOTE_TYPE_NAME,
+ id: Date.now().toString(),
+ flds: [{ name: "Word" }, { name: "Front" }, { name: "Back" }],
+ req: [[0, "all", [0]]],
+ tmpls: [
+ {
+ name: "Card 1",
+ qfmt: "{{Front}}" + `\n${audioPlayScriptBlock}`,
+ afmt: "{{Back}}" + `\n${audioPlayScriptBlock}`,
+ },
+ ],
+ css: cssData,
});
+// Deck and package shared by the functions below; the deck id is also taken from Date.now().
const deck = new anki.Deck(Date.now(), DEFAULT_DECK_NAME);
const package = new anki.Package();
+/**
+ * Build one Anki note per `<word>.html` file in DEFAULT_HTML_DIR and add it
+ * to the shared `deck`.
+ *
+ * - Front: the HTML of every `.pos-header.dpos-h` element, followed by the
+ *   `.audioGroup` markup when the page has one.
+ * - Back: the HTML of the whole `<body>`.
+ * - Tags: the distinct, sorted contents of the `epp-xref dxref` spans found
+ *   inside `.def-info.ddef-info` elements.
+ *
+ * A failure on one file is logged and does not stop the others
+ * (Promise.allSettled); a directory-listing failure is logged as well.
+ * The returned promise therefore never rejects.
+ *
+ * @returns {Promise<void>}
+ */
async function processHtmlFilesNew() {
- try {
- const files = await new Promise((resolve, reject) => {
- fs.readdir(HTML_DIR, (err, files) => {
- if (err) {
- writeToLog(err.message);
- reject(err);
- }
- resolve(files);
- });
- });
- let cnt = 0;
- function addNote(file) {
- return new Promise((resolve, reject) => {
- if (path.extname(file) === ".html") {
- const currentWord = file.match(/^(.+)\.html$/)[1];
- const filePath = path.join(HTML_DIR, file);
- fs.readFile(filePath, "utf8", (err, data) => {
- if (err) {
- writeToLog(err.message);
- reject(err);
- } else {
- try {
- const $ = cheerio.load(data);
- let front = "";
- $(".pos-header.dpos-h")
- .toArray()
- .forEach((ele) => {
- front += $(ele).html() + "
";
- });
- const back = $("body").html();
- if (!front || !back) {
- throw new Error(
- "EMPTY RESULT OF EXTRACTING FRONT OR BACK NOTES"
- );
- }
- const tags = $(".def-info.ddef-info")
- .toArray()
- .map((ele) =>
- $(ele).find("span[class*='epp-xref dxref']").html()
- )
- .filter((ele) => ele !== null);
- let tagResult = [];
- if (tags.length != 0) {
- const tagsDictinct = [...new Set(tags)].sort();
- tagResult = tagsDictinct;
- }
- deck.addNote(model.note([currentWord, front, back], tagResult));
- console.log(
- `[INFO] ${++cnt}/${
- files.length - 1
- } CURRENT WORD: ${currentWord}`
- );
- resolve();
- } catch (err) {
- writeToLog(`[ERR] ERROR PROCESSSING ${file}: ${err.message}`);
- reject(err);
- }
- }
- });
- } else {
- resolve();
- }
- });
- }
- await Promise.allSettled(files.map(addNote));
- } catch (err) {
- writeToLog(`[ERR] UNKNONW ERROR ${err.message}`);
- }
+    try {
+        const files = await new Promise((resolve, reject) => {
+            fs.readdir(DEFAULT_HTML_DIR, (err, entries) => {
+                if (err) {
+                    writeToLog(err.message);
+                    reject(err);
+                    return;
+                }
+                resolve(entries);
+            });
+        });
+        // Only .html files become notes; counting them (instead of assuming
+        // exactly one other file in the directory) keeps the progress total right.
+        const htmlFiles = files.filter(
+            (file) => path.extname(file) === ".html"
+        );
+        let cnt = 0;
+        function addNote(file) {
+            return new Promise((resolve, reject) => {
+                const currentWord = file.match(/^(.+)\.html$/)[1];
+                const filePath = path.join(DEFAULT_HTML_DIR, file);
+                fs.readFile(filePath, "utf8", (err, data) => {
+                    if (err) {
+                        writeToLog(err.message);
+                        reject(err);
+                        return;
+                    }
+                    try {
+                        const $ = cheerio.load(data);
+                        let front = "";
+                        $(".pos-header.dpos-h")
+                            .toArray()
+                            .forEach((ele) => {
+                                front += $(ele).html() + "
";
+                            });
+                        // .html() yields null when nothing matches. Appending it
+                        // unguarded would put the text "null" on the card and make
+                        // the emptiness check below impossible to trigger.
+                        const audioGroupHtml = $(".audioGroup").html();
+                        if (audioGroupHtml) {
+                            front += audioGroupHtml;
+                        }
+                        const back = $("body").html();
+                        if (!front || !back) {
+                            throw new Error(
+                                "EMPTY RESULT OF EXTRACTING FRONT OR BACK NOTES"
+                            );
+                        }
+                        const tags = $(".def-info.ddef-info")
+                            .toArray()
+                            .map((ele) =>
+                                $(ele)
+                                    .find("span[class*='epp-xref dxref']")
+                                    .html()
+                            )
+                            .filter((tag) => tag !== null);
+                        // Distinct tags in sorted order; an empty list stays empty.
+                        const tagResult = [...new Set(tags)].sort();
+                        deck.addNote(
+                            model.note([currentWord, front, back], tagResult)
+                        );
+                        console.log(
+                            `[INFO] ${++cnt}/${htmlFiles.length} CURRENT WORD: ${currentWord}`
+                        );
+                        resolve();
+                    } catch (processingErr) {
+                        writeToLog(
+                            `[ERR] ERROR PROCESSSING ${file}: ${processingErr.message}`
+                        );
+                        reject(processingErr);
+                    }
+                });
+            });
+        }
+        await Promise.allSettled(htmlFiles.map(addNote));
+    } catch (err) {
+        writeToLog(`[ERR] UNKNONW ERROR ${err.message}`);
+    }
+}
+
+/**
+ * Register every `.mp3` file found in DEFAULT_AUDIO_DIR as a media file of
+ * the shared `package`.
+ *
+ * A failure on one file is logged and the remaining files are still added;
+ * a directory-listing failure is logged as well. The returned promise
+ * therefore never rejects.
+ *
+ * @returns {Promise<void>}
+ */
+async function processAudioFiles() {
+    try {
+        const audioFiles = await new Promise((resolve, reject) => {
+            fs.readdir(DEFAULT_AUDIO_DIR, (err, entries) => {
+                if (err) {
+                    writeToLog(err.message);
+                    reject(err);
+                    return;
+                }
+                resolve(entries);
+            });
+        });
+        // Count only the files that are actually added so the progress total is accurate.
+        const mp3Files = audioFiles.filter(
+            (file) => path.extname(file) === ".mp3"
+        );
+        let cnt = 0;
+        // The per-file work is synchronous, so a plain loop replaces the
+        // promise-per-file wrapper.
+        for (const file of mp3Files) {
+            try {
+                // addMediaFile: 1st argument is the absolute path on disk,
+                // 2nd argument is the filename used for the media entry.
+                package.addMediaFile(path.join(DEFAULT_AUDIO_DIR, file), file);
+                console.log(
+                    `[INFO] ${++cnt}/${mp3Files.length} CURRENT AUDIO: ${file}`
+                );
+            } catch (err) {
+                writeToLog(`[ERR] WHILE ADDING AUDIO ${file} ` + err.message);
+            }
+        }
+    } catch (err) {
+        writeToLog(`[ERR] UNKNONW ERROR ${err.message}`);
+    }
}
+/**
+ * Entry point: collect the notes, collect the audio media, then write the
+ * deck into DEFAULT_APKG_NAME.
+ *
+ * Errors from assembling or writing the package are logged here because
+ * convert() is invoked without a rejection handler.
+ *
+ * @returns {Promise<void>}
+ */
async function convert() {
- await processHtmlFilesNew();
- package.addDeck(deck);
- package.writeToFile(DEFAULT_APKG_NAME);
+    try {
+        await processHtmlFilesNew();
+        await processAudioFiles();
+        package.addDeck(deck);
+        package.writeToFile(DEFAULT_APKG_NAME);
+    } catch (err) {
+        writeToLog(`[ERR] WHILE WRITING ${DEFAULT_APKG_NAME}: ${err.message}`);
+        // Signal the failure to the calling shell without cutting off pending log writes.
+        process.exitCode = 1;
+    }
}
convert();
diff --git a/src/output/html/common.css b/src/output/html/common.css
index dbdc65c..f29ea30 100644
--- a/src/output/html/common.css
+++ b/src/output/html/common.css
@@ -2,6 +2,10 @@
padding-left:10px;
}
+/* NOTE(review): presumably hides the elements that carry the Anki
+   `[sound:...]` tags built in audio_downloader.js — confirm against the
+   generated card HTML. */
+.sound{
+ display: none;
+}
+
img,
legend {
border: 0
diff --git a/src/utils/audio_downloader.js b/src/utils/audio_downloader.js
new file mode 100644
index 0000000..1209faa
--- /dev/null
+++ b/src/utils/audio_downloader.js
@@ -0,0 +1,90 @@
+const fs = require("fs");
+const bent = require("bent");
+const writeToLog = require("./logger");
+const stream = require("stream");
+const { promisify } = require("util");
+const pipeline = promisify(stream.pipeline);
+
+const DEFAULT_AUDIO_PATH = "src/output/audios";
+const DEFAULT_ROOT_URL_STR = "https://dictionary.cambridge.org";
+
+/**
+ * Stream one audio resource from DEFAULT_ROOT_URL_STR to a local file.
+ *
+ * @param {string} urlToken - Request path handed to the bent client created
+ *     for DEFAULT_ROOT_URL_STR; it must end in `/<name>.mp3`.
+ * @param {string} filePath - Destination path of the downloaded file.
+ * @returns {Promise<string>} The `<name>.mp3` part of `urlToken`, once the
+ *     whole response has been piped to disk.
+ */
+async function downloadFile(urlToken, filePath) {
+    const fetchStream = bent(DEFAULT_ROOT_URL_STR);
+    const source = await fetchStream(urlToken);
+    const sink = fs.createWriteStream(filePath);
+    await pipeline(source, sink);
+    return extractAudioFilename(urlToken, true);
+}
+
+/**
+ * Make sure the audio output directory (DEFAULT_AUDIO_PATH) exists, creating
+ * it together with any missing parent directories.
+ *
+ * DEFAULT_AUDIO_PATH is a relative path, so it is resolved against the
+ * process working directory.
+ *
+ * @returns {boolean} true when the directory already exists or was created,
+ *     false when creating it failed (the failure is logged).
+ */
+function mkdir4Audios() {
+    const directoryName = DEFAULT_AUDIO_PATH;
+    if (fs.existsSync(directoryName)) {
+        return true;
+    }
+    try {
+        fs.mkdirSync(directoryName, { recursive: true });
+        writeToLog(`[INFO] DIR ${directoryName} CREATED`);
+        return true;
+    } catch (error) {
+        // Every other call in this file passes writeToLog a single string, so
+        // the failure reason goes into the message instead of a second argument.
+        writeToLog(
+            `[ERR] FAILED TO CREATE DIR ${directoryName}: ${error.message}`
+        );
+        return false;
+    }
+}
+
+/**
+ * Pull the audio file name out of a URL or path that ends in `/<name>.mp3`.
+ *
+ * @param {string} url - URL or path whose last segment is an mp3 file.
+ * @param {boolean} isSuffix - Truthy: return `<name>.mp3`; falsy: return `<name>`.
+ * @returns {string} The last path segment, with or without the `.mp3` suffix.
+ * @throws {TypeError} When `url` does not end in `/<name>.mp3`, because the
+ *     failed match is indexed directly.
+ */
+function extractAudioFilename(url, isSuffix) {
+    const pattern = isSuffix ? /\/([^/]+\.mp3)$/ : /\/([^/]+)\.mp3$/;
+    return url.match(pattern)[1];
+}
+
+/**
+ * Wire up pronunciation audio for one dictionary entry.
+ *
+ * 1. Collects the `src` of the audio elements under `.entry-body .daud`.
+ * 2. For the i-th `.entry-body .pron.dpron` element, pushes a
+ *    `[sound:<word>-<name>.mp3]` snippet onto `audioDIVsBuffer` and sets an
+ *    `onClick` attribute that calls `playAudio('<word>-<name>')`.
+ * 3. Starts downloading every distinct audio URL into DEFAULT_AUDIO_PATH as
+ *    `<word>-<name>.mp3`. The downloads are not awaited; each outcome is
+ *    written to the log when all of them have settled.
+ *
+ * @param {Function} $ - Selector function for the loaded page (presumably a
+ *     cheerio instance — confirm against the caller); its elements are mutated.
+ * @param {string} word - Headword, used as the file-name prefix.
+ * @param {string[]} audioDIVsBuffer - Receives one snippet per pronunciation
+ *     element (mutated in place).
+ * @returns {void}
+ * @throws {TypeError} When a pronunciation element has no matching audio URL
+ *     or a URL does not end in `/<name>.mp3`.
+ */
+function fetchAudios($, word, audioDIVsBuffer) {
+    const daud = $(".entry-body .daud");
+    const pronunUrlArray = daud
+        .toArray()
+        .flatMap((e) =>
+            e.childNodes.filter((k) => k.type === "tag" && k.name === "audio")
+        )
+        // NOTE(review): relies on the 4th child node of each <audio> element
+        // carrying the mp3 `src` — confirm against the page markup.
+        .map((e) => e.childNodes[3].attribs["src"]);
+
+    // Build the Anki sound snippets and click handlers, one per pronunciation element.
+    const pron = $(".entry-body .pron.dpron").toArray();
+    for (let i = 0; i < pron.length; i++) {
+        const target = `${word}-${extractAudioFilename(pronunUrlArray[i], false)}`;
+        audioDIVsBuffer.push(
+            `[sound:${target}.mp3]
`
+        );
+        $(pron[i]).attr("onClick", `playAudio('${target}');`);
+    }
+
+    // The same recording can be referenced several times; download each URL once.
+    const pronunUrlArrayUnique = [...new Set(pronunUrlArray)];
+    if (!mkdir4Audios()) return;
+    Promise.allSettled(
+        pronunUrlArrayUnique.map((url) =>
+            downloadFile(
+                url,
+                `${DEFAULT_AUDIO_PATH}/${word}-${extractAudioFilename(url, true)}`
+            )
+        )
+    )
+        .then((results) => {
+            // allSettled never rejects: failed downloads show up as "rejected"
+            // entries whose `value` is undefined, so they are reported
+            // separately instead of being logged as written files.
+            const written = results
+                .filter((r) => r.status === "fulfilled")
+                .map((r) => r.value);
+            if (written.length > 0) {
+                writeToLog(`[INFO] AUDIO FILE [${written.join(", ")}] WRITTEN`);
+            }
+            for (const r of results) {
+                if (r.status === "rejected") {
+                    const reason =
+                        r.reason && r.reason.message
+                            ? r.reason.message
+                            : String(r.reason);
+                    writeToLog(
+                        `[ERR] WHILE DOWNLOADING AUDIO OF [${word}]: ${reason}`
+                    );
+                }
+            }
+        })
+        .catch((err) => writeToLog(err.message));
+}
+
+module.exports = fetchAudios;