-
-
Notifications
You must be signed in to change notification settings - Fork 65
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #44 from kenshinji/chore/refactoring-parser
chore: refactor parser.js and update tests.
- Loading branch information
Showing
2 changed files
with
118 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,31 +1,102 @@ | ||
const cheerio = require('cheerio') | ||
const chalk = require('chalk') | ||
const isChinese = require('is-chinese') | ||
|
||
// NOTE(review): this listing looks like a rendered diff in which removed and added lines are interleaved without +/- markers (both `let result = ''` and `const result = {}` appear below) — confirm against the real file before editing. | ||
let parser = {} | ||
parser.parse = function (isChinese, body) { | ||
/** | ||
* Parse the HTML of a word-lookup response into a structured result object. | ||
* The result may carry `pronounces` ({ type, voice } entries), `translations` | ||
* (strings) and `webTranslations` ({ key, values } entries); each property is | ||
* only created when matching elements are found in the document. | ||
* @param {string} body - HTML response body | ||
* @param {string} word - the word that was looked up | ||
* @returns {object} the parsed result object | ||
*/ | ||
function parser (body, word) { | ||
const $ = cheerio.load(body) | ||
let result = '' | ||
let sentenceSample = '' | ||
if (isChinese) { | ||
$('div.trans-container > ul').find('p.wordGroup').each(function (i, elm) { | ||
result += $(this).text().replace(/\s+/g, ' ') | ||
const result = {} | ||
|
||
// Parse pronunciations (phonetic transcriptions) | ||
const pronounces = $('.pronounce') | ||
if (pronounces.length) { | ||
result.pronounces = [] | ||
pronounces.each((i, el) => { | ||
// Strip the square brackets from the phonetic text | ||
const type = $(el).find('.phonetic').text().replace(/\[|\]/g, '') | ||
const voice = $(el).find('.speaker').attr('data-rel') | ||
// Keep an entry only when both the phonetic text and the voice value are present | ||
if (type && voice) { | ||
result.pronounces.push({ type, voice }) | ||
} | ||
}) | ||
} | ||
|
||
// Parse translations | ||
if (isChinese(word)) { | ||
// Chinese word looked up, English translations expected | ||
const trans = $('.trans-container > ul').children('li') | ||
if (trans.length) { | ||
result.translations = [] | ||
trans.each((i, el) => { | ||
result.translations.push($(el).text()) | ||
}) | ||
} | ||
} else { | ||
$('div#phrsListTab > div.trans-container > ul').find('li').each(function (i, elm) { | ||
result += $(this).text().replace(/\s+/g, ' ') + '\n' | ||
// English word looked up, Chinese translations expected | ||
const trans = $('#phrsListTab .trans-container > ul').children('li') | ||
if (trans.length) { | ||
result.translations = [] | ||
trans.each((i, el) => { | ||
// Collapse every run of whitespace into a single space | ||
result.translations.push($(el).text().replace(/\s+/g, ' ')) | ||
}) | ||
} | ||
} | ||
|
||
// Parse web definitions | ||
const webTrans = $('.web-translation .web-translation-item') | ||
if (webTrans.length) { | ||
result.webTranslations = [] | ||
webTrans.each((i, el) => { | ||
const key = $(el).find('.web-translation-key').text() | ||
// The value text is split on ';' into a list of values | ||
const values = $(el).find('.web-translation-value').text().split(';') | ||
result.webTranslations.push({ key, values }) | ||
}) | ||
$('#bilingual ul li').find('p').each(function (i, elm) { | ||
if ($(this).attr('class') !== 'example-via') { | ||
sentenceSample += $(this).text().trim() + '\n' | ||
} | ||
} | ||
|
||
return result | ||
} | ||
|
||
/** | ||
* Format a parsed result object into an output string. | ||
* For each of `pronounces`, `translations` and `webTranslations` that is present | ||
* and non-empty, a heading wrapped in chalk.gray is appended, followed by one | ||
* colored line per entry and a trailing newline. | ||
* NOTE(review): the `// phrase or sentence` and `// phonetic` fragments below look | ||
* like removed lines from a rendered diff — confirm against the real file. | ||
* @param {object} result - the parsed result object | ||
* @returns {string} the formatted output string | ||
*/ | ||
function format (result) { | ||
let output = '' | ||
|
||
// Pronunciation section: one yellow line per phonetic type | ||
if (result.pronounces && result.pronounces.length) { | ||
output += chalk.gray('\n发音:') | ||
result.pronounces.forEach(item => { | ||
output += chalk.yellow(`\n ${item.type}`) | ||
}) | ||
output += '\n' | ||
} | ||
// phrase or sentence | ||
if (result === '') { | ||
result = $('div#webPhrase > p.wordGroup').text() !== '' ? $('div#webPhrase > p.wordGroup').text() : $('div#fanyiToggle > div.trans-container > p:nth-child(2)').text() | ||
|
||
// Translation section: one green line per translation | ||
if (result.translations && result.translations.length) { | ||
output += chalk.gray('\n翻译:') | ||
result.translations.forEach(trans => { | ||
output += chalk.green(`\n ${trans}`) | ||
}) | ||
output += '\n' | ||
} | ||
// phonetic | ||
result = $('div#phrsListTab > h2.wordbook-js > div.baav > span').text().replace(/\s+/g, ' ') + | ||
'\n\n' + result + '\n' + sentenceSample | ||
return result | ||
|
||
// Web-definition section: the key in blue, then its values joined with '; ' in green | ||
if (result.webTranslations && result.webTranslations.length) { | ||
output += chalk.gray('\n网络释义:') | ||
result.webTranslations.forEach(item => { | ||
output += chalk.blue(`\n ${item.key}`) | ||
output += chalk.green(`\n ${item.values.join('; ')}`) | ||
}) | ||
output += '\n' | ||
} | ||
|
||
return output | ||
} | ||
|
||
// Export both functions as named properties so callers can destructure { parser, format }. | ||
module.exports = { | ||
parser, | ||
format | ||
} | ||
// NOTE(review): the assignment below looks like the removed pre-change export from a rendered diff; if both ran as written it would replace the object exported above — confirm. |
module.exports = parser |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,38 @@ | ||
const expect = require('chai').expect | ||
const Parser = require('../lib/parser') | ||
const { parser, format } = require('../lib/parser') | ||
const fs = require('fs') | ||
const path = require('path') | ||
// Expected strings compared by the `Parser.parse(false, body)` and `Parser.parse(true, body)` equality assertions in this file. | ||
const expectedOutput = `英 [həˈləʊ] 美 [helˈō] \n\nint. 喂;哈罗\nn. 表示问候, 惊奇或唤起注意时的用语\nn. (Hello)人名;(法)埃洛\n\nHello, who\'s speaking, please?\n喂, 请问你是谁呀?\nThe American walked to a telephone booth,"Hello. Is that the bank?\n那个美国人走到公用电话间旁打电话:"喂,银行吗?\nShe never passes without stopping to say hello.\n她从这儿经过时没有一次不停下来问候一番。\n` | ||
const expectedOutput2 = `\n\n hello ; hi ; how do you do \n` | ||
|
||
// Unit tests for the parser module: each case reads an HTML fixture from disk, parses it, and checks the parsed object and the formatted output. | ||
// NOTE(review): two `it(...)` headers appear back to back in each case, which looks like removed and added diff lines interleaved — confirm against the real file. | ||
describe('Unit tests for parser', () => { | ||
it('Test for parsing html page content when word is not Chinese', (done) => { | ||
it('Should parse English word correctly', (done) => { | ||
const body = fs.readFileSync(path.join(__dirname, '../test/fixtures/body.html')) | ||
expect(Parser.parse(false, body)).to.equal(expectedOutput) | ||
const result = parser(body, 'hello') | ||
|
||
expect(result).to.be.an('object') | ||
// Pronunciations are only checked when the parsed result contains any | ||
if (result.pronounces && result.pronounces.length) { | ||
expect(result.pronounces).to.be.an('array') | ||
const output = format(result) | ||
expect(output).to.include('发音') | ||
} | ||
|
||
// Translations must be present, and the formatted output must include the translation heading text | ||
expect(result).to.have.property('translations') | ||
expect(result.translations).to.be.an('array') | ||
const output = format(result) | ||
expect(output).to.be.a('string') | ||
expect(output).to.include('翻译') | ||
done() | ||
}) | ||
|
||
it('Test for parsing html page content when word is Chinese', (done) => { | ||
it('Should parse Chinese word correctly', (done) => { | ||
const body = fs.readFileSync(path.join(__dirname, '../test/fixtures/cn-body.html')) | ||
expect(Parser.parse(true, body)).to.equal(expectedOutput2) | ||
const result = parser(body, '你好') | ||
|
||
expect(result).to.have.property('translations') | ||
expect(result.translations).to.be.an('array') | ||
|
||
const output = format(result) | ||
expect(output).to.be.a('string') | ||
expect(output).to.include('翻译') | ||
done() | ||
}) | ||
}) |