Skip to content

Commit

Permalink
Merge pull request #12 from buhe/fix_html_loader
Browse files Browse the repository at this point in the history
  • Loading branch information
buhe authored Sep 4, 2023
2 parents 9666232 + 6508776 commit 1110ad3
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 15 deletions.
49 changes: 34 additions & 15 deletions Sources/LangChain/document_loaders/HtmlLoader.swift
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,8 @@ public struct HtmlLoader: BaseLoader {
do {
let doc: SwiftSoup.Document = try SwiftSoup.parse(html)
let text = try doc.text()
var title = ""
do {
title = try doc.title()
} catch {
print("Get title error \(error)")
}
let thumbnail = findImage(text: html)
let title = findTitle(doc: doc)
let thumbnail = findImage(text: html, doc: doc)
let metadata: [String: String] = ["url": url, "title": title, "thumbnail": thumbnail]
return [Document(page_content: text, metadata: metadata)]
} catch Exception.Error( _, let message) {
Expand All @@ -36,19 +31,43 @@ public struct HtmlLoader: BaseLoader {
return []
}
}

func findImage(text: String) -> String {
/// Extracts a title for the parsed page.
///
/// Prefers the `content` attribute of a `<meta property="twitter:title">` tag
/// in the document head and falls back to `doc.title()` when no non-empty
/// value is found there.
///
/// - Parameter doc: The parsed HTML document.
/// - Returns: The title, or an empty string if any lookup throws.
func findTitle(doc: SwiftSoup.Document) -> String {
    do {
        // `head()` is optional, so the whole chain evaluates to nil when the
        // document has no head; bind it instead of force-unwrapping.
        if let metaTitle = try doc.head()?.select("meta[property=twitter:title]").attr("content"),
           !metaTitle.isEmpty {
            return metaTitle
        }
        // No usable meta tag: use the document's own title.
        return try doc.title()
    } catch {
        // Best effort: a failed lookup is logged and yields an empty title.
        print("Get title error \(error)")
        return ""
    }
}
func findImage(text: String, doc: SwiftSoup.Document) -> String {
// First, try get html -> header -> <meta property="twitter:image"
// Second, try string match.
let pattern = "(http|https)://[\\S]+?\\.(jpg|jpeg|png|gif)"

do {
// print(text)
let regex = try NSRegularExpression(pattern: pattern, options: .caseInsensitive)
let matches = regex.matches(in: text, options: [], range: NSRange(location: 0, length: text.utf16.count))
if matches.isEmpty {
return ""
} else {
return String(text[Range(matches.first!.range, in: text)!])
var thumbnail = ""
let thumbnailOrNil = try doc.head()?.select("meta[property=twitter:image]").attr("content")
if thumbnailOrNil != nil {
thumbnail = thumbnailOrNil!
}
if thumbnail.isEmpty {
let regex = try NSRegularExpression(pattern: pattern, options: .caseInsensitive)
let matches = regex.matches(in: text, options: [], range: NSRange(location: 0, length: text.utf16.count))
if !matches.isEmpty {
thumbnail = String(text[Range(matches.first!.range, in: text)!])
}
}
return thumbnail
} catch {
print("Error: \(error.localizedDescription)")
return ""
Expand Down
35 changes: 35 additions & 0 deletions Tests/LangChainTests/langchain_swiftTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -850,9 +850,12 @@ May God bless you all. May God protect our troops.
let loader = HtmlLoader(html: plain, url: url)
let doc = await loader.load()
print("thumbnail: \(doc.first!.metadata["thumbnail"]!)")
print("title: \(doc.first!.metadata["title"]!)")

XCTAssertFalse(doc.isEmpty)
XCTAssertNotEqual("", doc.first!.page_content)
XCTAssertNotEqual("", doc.first!.metadata["thumbnail"]!)
XCTAssertNotEqual("", doc.first!.metadata["title"]!)
}
} catch {
// handle error
Expand Down Expand Up @@ -983,6 +986,38 @@ May God bless you all. May God protect our troops.
}
}

/// Fetches a WeChat article over the network and checks that `HtmlLoader`
/// produces non-empty page content plus non-empty `thumbnail` and `title`
/// metadata.
///
/// Request errors are printed and a non-OK response status skips the
/// assertions, so the test only verifies the loader when the page was
/// actually retrieved.
func testHtmlLoaderForWX() async throws {
    let url = "https://mp.weixin.qq.com/s/WPyNxKlaBuzFlJyYb2-Lpw"
    let eventLoopGroup = MultiThreadedEventLoopGroup(numberOfThreads: 1)
    let httpClient = HTTPClient(eventLoopGroupProvider: .shared(eventLoopGroup))
    defer {
        // It is important to shut down the httpClient after all requests are done,
        // even if one failed. See: https://github.com/swift-server/async-http-client
        try? httpClient.syncShutdown()
    }
    do {
        var request = HTTPClientRequest(url: url)
        request.headers.add(name: "User-Agent", value: "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/115.0.5790.130 Mobile/15E148 Safari/604.1")
        request.method = .GET

        let response = try await httpClient.execute(request, timeout: .seconds(120))
        print(response.headers)
        // Only exercise the loader when the page was actually served.
        guard response.status == .ok else { return }

        let plain = String(buffer: try await response.body.collect(upTo: 10240 * 1024))
        let loader = HtmlLoader(html: plain, url: url)
        let docs = await loader.load()

        // Record a test failure instead of crashing when the loader returns nothing.
        guard let first = docs.first else {
            XCTFail("HtmlLoader returned no documents for \(url)")
            return
        }
        // A missing metadata key is treated as empty so the assertions below
        // report it rather than trapping on a force-unwrap.
        let thumbnail = first.metadata["thumbnail"] ?? ""
        let title = first.metadata["title"] ?? ""
        print("thumbnail: \(thumbnail)")
        print("title: \(title)")

        XCTAssertNotEqual("", first.page_content)
        XCTAssertNotEqual("", thumbnail)
        XCTAssertNotEqual("", title)
    } catch {
        // Network/request errors are tolerated here: log and move on.
        print(error)
    }
}
//
// func testYoutubeHackClientList() async throws {
// let eventLoopGroup = MultiThreadedEventLoopGroup(numberOfThreads: 1)
Expand Down

0 comments on commit 1110ad3

Please sign in to comment.