Skip to content

Commit

Permalink
Use Ksoup for parsing feed posts content
Browse files Browse the repository at this point in the history
This allows us to properly extract the post text without HTML tags, and also to pick up an image URL when one is not present as a tag in the feed item but is instead embedded as an HTML tag in the description.
  • Loading branch information
msasikanth committed Aug 14, 2023
1 parent 8c7af3a commit d6a590f
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 78 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,13 @@
package dev.sasikanth.rss.reader.network

import android.net.Uri
import android.util.Xml
import com.mohamedrejeb.ksoup.html.parser.KsoupHtmlParser
import dev.sasikanth.rss.reader.models.FeedPayload
import dev.sasikanth.rss.reader.models.PostPayload
import io.github.aakira.napier.Napier
import java.time.ZonedDateTime
import java.time.format.DateTimeFormatter
import org.xmlpull.v1.XmlPullParser
import org.xmlpull.v1.XmlPullParserException

internal class AndroidAtomParser(private val parser: XmlPullParser, private val feedUrl: String) :
Parser() {
Expand Down Expand Up @@ -85,13 +84,16 @@ internal class AndroidAtomParser(private val parser: XmlPullParser, private val
"title" -> title = readTagText(tagName, parser)
"link" -> link = readAtomLink(parser)
"content" -> {
val atomContent = readAtomContent(tagName, parser)
if (content.isNullOrBlank()) {
content = atomContent.content
}
if (image.isNullOrBlank()) {
image = atomContent.imageUrl
}
val rawContent = readTagText(tagName, parser)
val contentParser =
KsoupHtmlParser(
handler =
HtmlContentParser {
if (image.isNullOrBlank()) image = it.imageUrl
content = it.content.ifBlank { rawContent.trim() }
},
)
contentParser.parseComplete(rawContent)
}
"published" -> date = readTagText(tagName, parser)
else -> skip(parser)
Expand All @@ -118,57 +120,6 @@ internal class AndroidAtomParser(private val parser: XmlPullParser, private val
)
}

/**
 * Reads the text of the Atom element named [tagName] and extracts plain text and an image URL
 * from it.
 *
 * The element text is fed to a second, namespace-unaware pull parser:
 * - every `img` start tag overwrites the image URL with its `src` attribute, so the last `img`
 *   encountered wins;
 * - non-blank text is collected (trimmed, each chunk followed by a single space) while the most
 *   recently opened tag is `p`, `a`, `span` or `em`. The current tag is not reset when a tag
 *   closes, so text following a closing tag is attributed to the tag opened before it.
 *
 * If the element text cannot be parsed as XML, the raw element text is returned as the content
 * instead. Any image URL found before the failure is kept.
 *
 * @param tagName name of the element [parser] is currently positioned on.
 * @param parser feed parser, expected to be at the start tag of [tagName].
 * @return the extracted image URL (if any) and text content.
 */
private fun readAtomContent(tagName: String, parser: XmlPullParser): AtomContent {
  parser.require(XmlPullParser.START_TAG, namespace, tagName)

  val rawContent = readTagText(tagName, parser)
  val contentBuilder = StringBuilder()
  var imageUrl: String? = null

  try {
    val contentParser =
      Xml.newPullParser().apply { setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, false) }
    contentParser.setInput(rawContent.reader())

    var parsedContent = false
    var currentEventType = contentParser.eventType
    var currentTag: String? = null

    while (!parsedContent && currentEventType != XmlPullParser.END_DOCUMENT) {
      when (currentEventType) {
        XmlPullParser.START_TAG -> {
          currentTag = contentParser.name
          if (currentTag == "img") {
            imageUrl = contentParser.getAttributeValue(namespace, "src")
          }
        }
        XmlPullParser.TEXT -> {
          val text = contentParser.text.trim()
          val isTextTag =
            currentTag == "p" || currentTag == "a" || currentTag == "span" || currentTag == "em"
          if (text.isNotBlank() && isTextTag) {
            contentBuilder.append("$text ")
          }
        }
        XmlPullParser.END_TAG -> {
          if (contentParser.name == tagName) {
            parsedContent = true
          }
        }
      }
      currentEventType = contentParser.next()
    }
  } catch (e: XmlPullParserException) {
    // Discard whatever was collected before the failure; otherwise the result would contain the
    // partially extracted text followed by the full raw markup (i.e. duplicated content).
    contentBuilder.setLength(0)
    contentBuilder.append(rawContent)
  }

  return AtomContent(imageUrl = imageUrl, content = contentBuilder.toString())
}

private fun readAtomLink(parser: XmlPullParser): String? {
var link: String? = null
parser.require(XmlPullParser.START_TAG, namespace, "link")
Expand All @@ -181,5 +132,3 @@ internal class AndroidAtomParser(private val parser: XmlPullParser, private val
return link
}
}

/**
 * Value holder for the result of parsing an Atom content element.
 *
 * @property imageUrl URL of an image found in the content, or `null` when none was found.
 * @property content text extracted from the content.
 */
private data class AtomContent(
  val imageUrl: String?,
  val content: String,
)
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
package dev.sasikanth.rss.reader.network

import android.net.Uri
import com.mohamedrejeb.ksoup.html.parser.KsoupHtmlParser
import dev.sasikanth.rss.reader.models.FeedPayload
import dev.sasikanth.rss.reader.models.PostPayload
import io.github.aakira.napier.Napier
Expand Down Expand Up @@ -108,6 +109,17 @@ internal class AndroidRssParser(private val parser: XmlPullParser, private val f
}
?: System.currentTimeMillis()

val contentParser =
KsoupHtmlParser(
handler =
HtmlContentParser {
if (image.isNullOrBlank()) image = it.imageUrl
description = it.content.ifBlank { description?.trim() }
},
)

contentParser.parseComplete(description.orEmpty())

return PostPayload(
title = FeedParser.cleanText(title).orEmpty(),
link = FeedParser.cleanText(link).orEmpty(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ package dev.sasikanth.rss.reader.network

import com.mohamedrejeb.ksoup.html.parser.KsoupHtmlHandler

class IOSAtomContentParser(private val onEnd: (AtomContent) -> Unit) : KsoupHtmlHandler {
class HtmlContentParser(private val onEnd: (HtmlContent) -> Unit) : KsoupHtmlHandler {

private val currentData: MutableMap<String, String> = mutableMapOf()
private var currentTag: String? = null
Expand All @@ -44,7 +44,7 @@ class IOSAtomContentParser(private val onEnd: (AtomContent) -> Unit) : KsoupHtml

override fun onEnd() {
onEnd(
AtomContent(
HtmlContent(
imageUrl = currentData["imageUrl"],
content = currentData["content"].orEmpty().trim()
)
Expand All @@ -53,4 +53,4 @@ class IOSAtomContentParser(private val onEnd: (AtomContent) -> Unit) : KsoupHtml
}
}

/**
 * Value holder for content extracted from a post's HTML.
 *
 * @property imageUrl URL of an image found in the content, or `null` when none was found.
 * @property content text extracted from the content.
 */
data class AtomContent(
  val imageUrl: String?,
  val content: String,
)
/**
 * Value holder for content extracted from a post's HTML.
 *
 * @property imageUrl URL of an image found in the HTML, or `null` when none was found.
 * @property content text extracted from the HTML.
 */
data class HtmlContent(
  val imageUrl: String?,
  val content: String,
)
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
*/
package dev.sasikanth.rss.reader.network

import com.mohamedrejeb.ksoup.html.parser.KsoupHtmlOptions
import com.mohamedrejeb.ksoup.html.parser.KsoupHtmlParser
import dev.sasikanth.rss.reader.models.FeedPayload
import dev.sasikanth.rss.reader.models.PostPayload
Expand All @@ -36,17 +35,10 @@ internal fun PostPayload.Companion.mapAtomPost(atomMap: Map<String, String>): Po
val parser =
KsoupHtmlParser(
handler =
IOSAtomContentParser {
if (imageUrl.isNullOrBlank()) {
imageUrl = it.imageUrl
}

content = it.content
HtmlContentParser {
if (imageUrl.isNullOrBlank()) imageUrl = it.imageUrl
content = it.content.ifBlank { data?.trim() }
},
options =
KsoupHtmlOptions(
xmlMode = true,
)
)

parser.parseComplete(data.orEmpty())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/
package dev.sasikanth.rss.reader.network

import com.mohamedrejeb.ksoup.html.parser.KsoupHtmlParser
import dev.sasikanth.rss.reader.models.FeedPayload
import dev.sasikanth.rss.reader.models.PostPayload
import io.github.aakira.napier.Napier
Expand All @@ -30,8 +31,19 @@ private val abbrevTimezoneDateFormatter =
internal fun PostPayload.Companion.mapRssPost(rssMap: Map<String, String>): PostPayload {
val pubDate = rssMap["pubDate"]
val link = rssMap["link"]
val description = rssMap["description"]
val imageUrl: String? = rssMap["imageUrl"]
var description = rssMap["description"]
var imageUrl: String? = rssMap["imageUrl"]

val contentParser =
KsoupHtmlParser(
handler =
HtmlContentParser {
if (imageUrl.isNullOrBlank()) imageUrl = it.imageUrl
description = it.content.ifBlank { description?.trim() }
},
)

contentParser.parseComplete(description.orEmpty())

return PostPayload(
title = FeedParser.cleanText(rssMap["title"])!!,
Expand Down

0 comments on commit d6a590f

Please sign in to comment.