- 李笑来的自学教程
import re
# from bs4 import BeautifulSoup
# Load the saved quiz page. The pattern below matches CJK text, so decode
# explicitly instead of relying on the platform's default encoding.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
# `r` stays bound (closed) after the block, as it did with the manual close().
with open('./Knight2.html', 'r', encoding='utf-8') as r:
    html = r.read()

# Strip every whitespace character so the markup between a question and its
# options can be matched as one contiguous run. Raw string: '\s' in a normal
# string literal is an invalid escape sequence and triggers a warning.
html = re.sub(r'\s', '', html)

# Three capture groups, separated by runs of tag/attribute characters:
#   1. the text right after class="question">
#   2. a run starting with one character from the first bracketed set
#   3. a run starting with one character from the second bracketed set
# (presumably: question text, first option, second option -- verify against
# the page markup).
# NOTE(review): "[A|B|...|正确|错误]" and "[B|错误]" are character classes, so
# they match exactly ONE character from the set (including a literal "|"),
# not the words "正确" / "错误". If alternation was intended, use a group such
# as (?:A|B|正确|错误). Left unchanged here because the input page is needed
# to validate a rewritten pattern.
pttn = r'class="question">([\w\.【】《》()\(\)?\\,“”、。:\u4e00-\u9fa5]+)[/\s\w"<>=\/\-_\d]+([A|B|C|D|E|F|G|H|I|正确|错误][\\()【】?\w,“”\s、。:\u4e00-\u9fa5]+)[!/\s\w"<>=\/\-_\d]+([B|错误][\(\)【】?\w,“”\s、。:\u4e00-\u9fa5]+)'

# Keep the result: a bare findall() call is only displayed when it is the last
# expression of a notebook cell and is silently discarded in a script.
matches = re.findall(pttn, html)
print(matches)

# Alternative approach (disabled): select the items with BeautifulSoup
# instead of a regular expression.
# sp = BeautifulSoup(html, 'html.parser')
# div = sp.select('#app > div > div > div.hw > div .item')
# print(div)
from lxml import etree

# The input is an HTML page. etree.parse() uses the strict XML parser unless a
# parser is passed explicitly, so hand it an HTMLParser.
# NOTE: lxml raises OSError ("Error reading file ...: failed to load external
# entity") when the given path cannot be read; the relative path is resolved
# against the current working directory.
tree = etree.parse("./Knight2.html", parser=etree.HTMLParser())
print(tree)
OSError Traceback (most recent call last)
/var/folders/w6/9k4dzqlj617f06dfby_vk1pr0000gn/T/ipykernel_18599/3028685901.py in 1 from lxml import etree 2 ----> 3 tree = etree.parse("./Knight2.html") 4 5 print(tree)
src/lxml/etree.pyx in lxml.etree.parse()
src/lxml/parser.pxi in lxml.etree._parseDocument()
src/lxml/parser.pxi in lxml.etree._parseDocumentFromURL()
src/lxml/parser.pxi in lxml.etree._parseDocFromFile()
src/lxml/parser.pxi in lxml.etree._BaseParser._parseDocFromFile()
src/lxml/parser.pxi in lxml.etree._ParserContext._handleParseResultDoc()
src/lxml/parser.pxi in lxml.etree._handleParseResult()
src/lxml/parser.pxi in lxml.etree._raiseParseError()
OSError: Error reading file './Knight2.html': failed to load external entity "./Knight2.html"
- 李笑来的自学教程
import re
# from bs4 import BeautifulSoup
# Load the saved quiz page. The pattern below matches CJK text, so decode
# explicitly instead of relying on the platform's default encoding.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
# `r` stays bound (closed) after the block, as it did with the manual close().
with open('./Knight2.html', 'r', encoding='utf-8') as r:
    html = r.read()

# Strip every whitespace character so the markup between a question and its
# options can be matched as one contiguous run. Raw string: '\s' in a normal
# string literal is an invalid escape sequence and triggers a warning.
html = re.sub(r'\s', '', html)

# Three capture groups, separated by runs of tag/attribute characters:
#   1. the text right after class="question">
#   2. a run starting with one character from the first bracketed set
#   3. a run starting with one character from the second bracketed set
# (presumably: question text, first option, second option -- verify against
# the page markup).
# NOTE(review): "[A|B|...|正确|错误]" and "[B|错误]" are character classes, so
# they match exactly ONE character from the set (including a literal "|"),
# not the words "正确" / "错误". If alternation was intended, use a group such
# as (?:A|B|正确|错误). Left unchanged here because the input page is needed
# to validate a rewritten pattern.
pttn = r'class="question">([\w\.【】《》()\(\)?\\,“”、。:\u4e00-\u9fa5]+)[/\s\w"<>=\/\-_\d]+([A|B|C|D|E|F|G|H|I|正确|错误][\\()【】?\w,“”\s、。:\u4e00-\u9fa5]+)[!/\s\w"<>=\/\-_\d]+([B|错误][\(\)【】?\w,“”\s、。:\u4e00-\u9fa5]+)'

# Keep the result: a bare findall() call is only displayed when it is the last
# expression of a notebook cell and is silently discarded in a script.
matches = re.findall(pttn, html)
print(matches)

# Alternative approach (disabled): select the items with BeautifulSoup
# instead of a regular expression.
# sp = BeautifulSoup(html, 'html.parser')
# div = sp.select('#app > div > div > div.hw > div .item')
# print(div)
from lxml import etree

# The input is an HTML page. etree.parse() uses the strict XML parser unless a
# parser is passed explicitly, so hand it an HTMLParser.
# NOTE(review): this path ("./night2.html") differs from the "./Knight2.html"
# file opened earlier in this script -- confirm which file name is correct.
# lxml raises OSError ("Error reading file ...") when the path cannot be read.
tree = etree.parse("./night2.html", parser=etree.HTMLParser())
print(tree)