Skip to content

Latest commit

 

History

History
134 lines (85 loc) · 3.42 KB

正则学习与实践.md

File metadata and controls

134 lines (85 loc) · 3.42 KB

正则表达式的学习

参考

  • 李笑来的自学教程
import re
# from bs4 import BeautifulSoup


# Read the saved quiz page in one go. The context manager closes the handle
# even if read() raises; `r` stays bound (to the closed file) afterwards.
# NOTE(review): assumes the page was saved as UTF-8 -- confirm.
with open('./Knight2.html', 'r', encoding='utf-8') as r:
    html = r.read()

# Remove every whitespace character (spaces, tabs, line breaks) so the
# pattern below never has to span indentation or newlines between tags.
# Raw string: '\s' in a plain literal is an invalid escape sequence.
html = re.sub(r'\s', '', html)

# Pattern with three capture groups:
#   1. the text right after `class="question">`, made of word characters,
#      the listed punctuation and CJK characters (\u4e00-\u9fa5);
#   2. after a run of markup characters, a piece of text whose first
#      character is one of A-I, '|', 正, 确, 错, 误;
#   3. after another run of markup characters, a piece of text whose first
#      character is one of B, '|', 错, 误.
# NOTE(review): inside [...] the '|' is a literal character, not alternation,
# so `[A|B|...|正确|错误]` matches ONE character from that set. A bare
# "正确" still matches because the following class consumes the second
# character; rewriting this as a real alternation would change what matches.
pttn = r'class="question">([\w\.【】《》()\(\)?\\,“”、。:\u4e00-\u9fa5]+)[/\s\w"<>=\/\-_\d]+([A|B|C|D|E|F|G|H|I|正确|错误][\\()【】?\w,“”\s、。:\u4e00-\u9fa5]+)[!/\s\w"<>=\/\-_\d]+([B|错误][\(\)【】?\w,“”\s、。:\u4e00-\u9fa5]+)'

# With three groups, findall returns a list of 3-tuples, one per match.
# The result is not bound to a name here.
re.findall(pttn, html)



# sp = BeautifulSoup(html, 'html.parser')
# div = sp.select('#app > div > div > div.hw > div .item')
# print(div)
from lxml import etree

# etree.parse() uses the XML parser by default, which raises on ordinary
# HTML that is not well-formed XML; pass the HTML parser explicitly.
# The path is relative to the current working directory.
tree = etree.parse("./Knight2.html", etree.HTMLParser())

print(tree)

OSError Traceback (most recent call last)

/var/folders/w6/9k4dzqlj617f06dfby_vk1pr0000gn/T/ipykernel_18599/3028685901.py in <module>
      1 from lxml import etree
      2 
----> 3 tree = etree.parse("./Knight2.html")
      4 
      5 print(tree)

src/lxml/etree.pyx in lxml.etree.parse()

src/lxml/parser.pxi in lxml.etree._parseDocument()

src/lxml/parser.pxi in lxml.etree._parseDocumentFromURL()

src/lxml/parser.pxi in lxml.etree._parseDocFromFile()

src/lxml/parser.pxi in lxml.etree._BaseParser._parseDocFromFile()

src/lxml/parser.pxi in lxml.etree._ParserContext._handleParseResultDoc()

src/lxml/parser.pxi in lxml.etree._handleParseResult()

src/lxml/parser.pxi in lxml.etree._raiseParseError()

OSError: Error reading file './Knight2.html': failed to load external entity "./Knight2.html"

参考

  • 李笑来的自学教程
import re
# from bs4 import BeautifulSoup


# Read the saved quiz page in one go. The context manager closes the handle
# even if read() raises; `r` stays bound (to the closed file) afterwards.
# NOTE(review): assumes the page was saved as UTF-8 -- confirm.
with open('./Knight2.html', 'r', encoding='utf-8') as r:
    html = r.read()

# Remove every whitespace character (spaces, tabs, line breaks) so the
# pattern below never has to span indentation or newlines between tags.
# Raw string: '\s' in a plain literal is an invalid escape sequence.
html = re.sub(r'\s', '', html)

# Pattern with three capture groups:
#   1. the text right after `class="question">`, made of word characters,
#      the listed punctuation and CJK characters (\u4e00-\u9fa5);
#   2. after a run of markup characters, a piece of text whose first
#      character is one of A-I, '|', 正, 确, 错, 误;
#   3. after another run of markup characters, a piece of text whose first
#      character is one of B, '|', 错, 误.
# NOTE(review): inside [...] the '|' is a literal character, not alternation,
# so `[A|B|...|正确|错误]` matches ONE character from that set. A bare
# "正确" still matches because the following class consumes the second
# character; rewriting this as a real alternation would change what matches.
pttn = r'class="question">([\w\.【】《》()\(\)?\\,“”、。:\u4e00-\u9fa5]+)[/\s\w"<>=\/\-_\d]+([A|B|C|D|E|F|G|H|I|正确|错误][\\()【】?\w,“”\s、。:\u4e00-\u9fa5]+)[!/\s\w"<>=\/\-_\d]+([B|错误][\(\)【】?\w,“”\s、。:\u4e00-\u9fa5]+)'

# With three groups, findall returns a list of 3-tuples, one per match.
# The result is not bound to a name here.
re.findall(pttn, html)



# sp = BeautifulSoup(html, 'html.parser')
# div = sp.select('#app > div > div > div.hw > div .item')
# print(div)
from lxml import etree

# Parse the same file the regex section reads ("./Knight2.html"); the
# previous "./night2.html" did not match that name.
# etree.parse() uses the XML parser by default, which raises on ordinary
# HTML that is not well-formed XML; pass the HTML parser explicitly.
# The path is relative to the current working directory.
tree = etree.parse("./Knight2.html", etree.HTMLParser())

print(tree)