-
Notifications
You must be signed in to change notification settings - Fork 0
/
ParseWaterdeepNewsArticles.main.kts
96 lines (82 loc) · 2.63 KB
/
ParseWaterdeepNewsArticles.main.kts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
@file:DependsOn("it.skrape:skrapeit:1.2.2")
import it.skrape.core.htmlDocument
import it.skrape.selects.eachHref
import it.skrape.selects.html5.*
import java.io.BufferedReader
import java.io.File
import java.io.InputStreamReader
import java.net.HttpURLConnection
import java.net.URL
// Script entry point: announce the run on stdout, then invoke main().
println("Starting")
main()
/**
 * Downloads every article returned by [queryMainPageForLinks] and writes each one as a Markdown
 * file, named after the article title, into the campaign's Waterdeep News folder.
 *
 * Failures are handled per article: the error is printed and the run continues with the next link.
 */
fun main() {
    val folder = File("/Users/slehrbaum/Library/CloudStorage/OneDrive-Personal/My_DND5e_Campaign/Places/Towns/Waterdeep/News")
    // Without the target folder every write would fail, and only after its article had already
    // been downloaded, so make sure it exists before fetching anything.
    if (!folder.isDirectory && !folder.mkdirs()) {
        println("Cannot create output folder $folder")
        return
    }
    // Path separators plus the other characters OneDrive does not allow in file names.
    val illegalFileNameChars = Regex("""[\\/:*?"<>|]""")
    queryMainPageForLinks().forEach { url ->
        try {
            val (title, body) = parseArticle(url)
            val fileName = title.replace(illegalFileNameChars, "_").trim()
            // A blank title would yield the hidden file ".md", overwritten by each untitled article.
            require(fileName.isNotEmpty()) { "article has no usable title" }
            File(folder, "$fileName.md").writeText(body)
        } catch (e: Exception) {
            // Deliberate best effort: one broken article must not abort the remaining downloads.
            println("Failed for $url with $e")
        }
    }
}
/**
 * Collects the article links listed on the Waterdeep News archive page.
 *
 * Reads the href of every anchor carrying the CSS class `serieslink`, discards the ones ending
 * in "zip", and prefixes each remaining href with `https://rpg.nobl.ca`.
 *
 * @return the prefixed hrefs
 */
fun queryMainPageForLinks(): List<String> {
    val archiveHtml = getWebsite("https://rpg.nobl.ca/archive.php?x=dnd/archfr/wdn")
    val hrefs: List<String> = htmlDocument(archiveHtml) {
        a {
            withClass = "serieslink"
            findAll { eachHref }
        }
    }
    return hrefs
        .filterNot { it.endsWith("zip") }
        .map { href -> "https://rpg.nobl.ca" + href }
}
/**
 * Downloads one article page and extracts its title and body text.
 *
 * Inside the `td` cell selected by `valign="top"` and `width="100%"`, the first `b` element
 * supplies the title and the `p` elements supply the paragraphs. Paragraphs are kept up to (not
 * including) the first blank one, leading paragraphs equal to the title are then dropped, and
 * the remainder is joined with blank lines in between.
 *
 * @param url address of the article page
 * @return the title paired with the joined body text
 */
fun parseArticle(url: String): Pair<String, String> {
    val pageHtml = getWebsite(url)
    val (headline, paragraphs) = htmlDocument(pageHtml) {
        td {
            withAttributes = listOf("valign" to "top", "width" to "100%")
            val heading = b { findFirst { text } }
            val texts = p { findAll { map { it.text } } }
            heading to texts
        }
    }
    val body = paragraphs
        .takeWhile { it.isNotBlank() }
        .dropWhile { it == headline }
        .joinToString(separator = "\n\n")
    return headline to body
}
/**
 * Fetches [url] with a plain HTTP GET and returns the response body as text.
 *
 * Skrape.it's web client has some trouble inside this script, so the page is fetched manually.
 * The body is decoded as UTF-8 rather than the JVM's platform default, so the result does not
 * depend on the machine running the script.
 *
 * @param url address of the page; it must be an http/https URL because the connection is cast
 *   to [HttpURLConnection]
 * @return the complete response body
 * @throws Exception if the server answers with any status other than 200 OK; I/O, timeout and
 *   malformed-URL errors raised by the JDK propagate unchanged
 */
fun getWebsite(url: String): String {
    val connection = URL(url).openConnection() as HttpURLConnection
    try {
        connection.requestMethod = "GET"
        // Bound both phases so a single unresponsive server cannot hang the whole run.
        connection.connectTimeout = 15_000
        connection.readTimeout = 30_000
        val responseCode = connection.responseCode
        if (responseCode != HttpURLConnection.HTTP_OK) {
            throw Exception("Failed to connect. Response code: $responseCode")
        }
        // `use` closes the stream even when reading fails part-way through.
        return connection.inputStream.bufferedReader(Charsets.UTF_8).use { it.readText() }
    } finally {
        // Runs on success, on a non-200 status and on any I/O failure alike.
        connection.disconnect()
    }
}