Skip to content

Commit

Permalink
Documents search
Browse files Browse the repository at this point in the history
  • Loading branch information
Unbewohnte committed Jan 26, 2023
1 parent 00bc33d commit fd484c6
Show file tree
Hide file tree
Showing 7 changed files with 198 additions and 12 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ There are some special `query` values:
- `images` - find all images on pages and output to the corresponding directory in `output_dir` (**IMPORTANT**: set `content_fetch_timeout_ms` to `0` so the images (and other content below) load fully)
- `videos` - find and fetch files that look like videos
- `audio` - find and fetch files that look like audio
- `everything` - find and fetch images, audio and video
- `documents` - find and fetch files that look like documents
- `everything` - find and fetch images, audio, video, documents and email addresses

When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it.

Expand Down
10 changes: 6 additions & 4 deletions src/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,16 @@ const (
QueryVideos string = "videos"
QueryAudio string = "audio"
QueryEmail string = "email"
QueryDocuments string = "documents"
QueryEverything string = "everything"
)

// Names of the per-content-type subdirectories. Elsewhere in this change they
// are joined onto the configured output directory (conf.Save.OutputDir) to
// build the path where fetched content of that kind is written.
const (
SavePagesDir string = "pages"
SaveImagesDir string = "images"
SaveVideosDir string = "videos"
SaveAudioDir string = "audio"
SavePagesDir string = "pages"
SaveImagesDir string = "images"
SaveVideosDir string = "videos"
SaveAudioDir string = "audio"
SaveDocumentsDir string = "documents"
)

type Search struct {
Expand Down
17 changes: 15 additions & 2 deletions src/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ import (
"unbewohnte/wecr/worker"
)

const version = "v0.2.4"
const version = "v0.2.5"

const (
defaultConfigFile string = "conf.json"
Expand Down Expand Up @@ -295,6 +295,12 @@ func main() {
return
}

err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveDocumentsDir), os.ModePerm)
if err != nil {
logger.Error("Failed to create output directory for documents: %s", err)
return
}

switch conf.Search.Query {
case config.QueryEmail:
logger.Info("Looking for email addresses")
Expand All @@ -304,8 +310,15 @@ func main() {
logger.Info("Looking for videos (%+s)", web.VideoExtentions)
case config.QueryAudio:
logger.Info("Looking for audio (%+s)", web.AudioExtentions)
case config.QueryDocuments:
logger.Info("Looking for documents (%+s)", web.DocumentExtentions)
case config.QueryEverything:
logger.Info("Looking for email addresses, images, videos and audio (%+s - %+s - %+s)", web.ImageExtentions, web.VideoExtentions, web.AudioExtentions)
logger.Info("Looking for email addresses, images, videos, audio and various documents (%+s - %+s - %+s - %+s)",
web.ImageExtentions,
web.VideoExtentions,
web.AudioExtentions,
web.DocumentExtentions,
)
default:
if conf.Search.IsRegexp {
logger.Info("Looking for RegExp matches (%s)", conf.Search.Query)
Expand Down
100 changes: 100 additions & 0 deletions src/web/documents.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
package web

import (
"net/url"
"strings"
)

// HasDocumentExtention reports whether url points to a file whose extention
// is listed in DocumentExtentions.
//
// The comparison is case-insensitive and ignores any query string or fragment,
// so "https://host/REPORT.PDF" and "https://host/report.pdf?v=2" are both
// recognised, while "https://host/page.html#notes.pdf" is not.
func HasDocumentExtention(url string) bool {
	// Only the part before the first '?' or '#' can carry the file name;
	// whatever follows is a query string or a fragment.
	if cut := strings.IndexAny(url, "?#"); cut != -1 {
		url = url[:cut]
	}

	// Every known extention is stored in lower case, so normalise the
	// candidate once instead of once per comparison.
	url = strings.ToLower(url)

	for _, extention := range DocumentExtentions {
		if strings.HasSuffix(url, extention) {
			return true
		}
	}

	return false
}

// Tries to find docs' URLs on the page
func FindPageDocuments(pageBody []byte, from *url.URL) []string {
var urls []string

// for every element that has "src" attribute
for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) {
var linkStartIndex int
var linkEndIndex int

linkStartIndex = strings.Index(match, "\"")
if linkStartIndex == -1 {
linkStartIndex = strings.Index(match, "'")
if linkStartIndex == -1 {
continue
}

linkEndIndex = strings.LastIndex(match, "'")
if linkEndIndex == -1 {
continue
}
} else {
linkEndIndex = strings.LastIndex(match, "\"")
if linkEndIndex == -1 {
continue
}
}

if linkEndIndex <= linkStartIndex+1 {
continue
}

link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
if err != nil {
continue
}

linkResolved := ResolveLink(link, from.Host)
if HasDocumentExtention(linkResolved) {
urls = append(urls, linkResolved)
}
}

// for every "a" element as well
for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
var linkStartIndex int
var linkEndIndex int

linkStartIndex = strings.Index(match, "\"")
if linkStartIndex == -1 {
linkStartIndex = strings.Index(match, "'")
if linkStartIndex == -1 {
continue
}

linkEndIndex = strings.LastIndex(match, "'")
if linkEndIndex == -1 {
continue
}
} else {
linkEndIndex = strings.LastIndex(match, "\"")
if linkEndIndex == -1 {
continue
}
}

if linkEndIndex <= linkStartIndex+1 {
continue
}

link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
if err != nil {
continue
}

linkResolved := ResolveLink(link, from.Host)
if HasDocumentExtention(linkResolved) {
urls = append(urls, linkResolved)
}
}

// return discovered doc urls
return urls
}
53 changes: 53 additions & 0 deletions src/web/extentions.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,56 @@ var VideoExtentions = []string{
".vob",
".ogv",
}

// DocumentExtentions lists the lower-case file extentions (leading dot
// included) that are treated as documents. Each extention appears exactly
// once.
var DocumentExtentions = []string{
	".pdf",
	".doc",
	".docx",
	".epub",
	".fb2",
	".pub",
	".ppt",
	".pptx",
	".txt",
	".tex",
	".odt",
	".bib",
	".ps",
	".dwg",
	".lyx",
	".key",
	".ott",
	".odf",
	".odc",
	".ppg",
	".xlc",
	".latex",
	".c",
	".cpp",
	".sh",
	".go",
	".java",
	".cs",
	".rs",
	".lua",
	".php",
	".py",
	".pl",
	".kt",
	".js",
	".rb",
	".asm",
	".rar",
	".tar",
	".db",
	".7z",
	".zip",
	".gbr",
	".ttf",
	".ttc",
	".woff",
	".otf",
	".exif",
}
2 changes: 1 addition & 1 deletion src/web/images.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ func FindPageImages(pageBody []byte, from *url.URL) []string {
continue
}

link, err := url.Parse(match)
link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
if err != nil {
continue
}
Expand Down
25 changes: 21 additions & 4 deletions src/worker/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ func (w *Worker) saveContent(links []string, pageURL *url.URL) {
filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveVideosDir, fileName)
} else if web.HasAudioExtention(link) {
filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveAudioDir, fileName)
} else if web.HasDocumentExtention(link) {
filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveDocumentsDir, fileName)
} else {
filePath = filepath.Join(w.Conf.Save.OutputDir, fileName)
}
Expand Down Expand Up @@ -146,11 +148,16 @@ func (w *Worker) Work() {
if w.Conf.VisitQueue.VisitQueue != nil {
w.Conf.VisitQueue.Lock.Lock()
newJob, err := queue.PopLastJob(w.Conf.VisitQueue.VisitQueue)
if err != nil || newJob == nil {
if err != nil {
logger.Error("Failed to get a new job from visit queue: %s", err)
w.Conf.VisitQueue.Lock.Unlock()
continue
}
if newJob == nil {
w.Conf.VisitQueue.Lock.Unlock()
continue
}

job = *newJob
w.Conf.VisitQueue.Lock.Unlock()
} else {
Expand Down Expand Up @@ -276,26 +283,35 @@ func (w *Worker) Work() {
case config.QueryImages:
// find image URLs, output images to the file while not saving already outputted ones
imageLinks := web.FindPageImages(pageData, pageURL)
w.saveContent(imageLinks, pageURL)
if len(imageLinks) > 0 {
w.saveContent(imageLinks, pageURL)
savePage = true
}

case config.QueryVideos:
// search for videos
// find video URLs, output videos to the files while not saving already outputted ones
videoLinks := web.FindPageVideos(pageData, pageURL)
w.saveContent(videoLinks, pageURL)
if len(videoLinks) > 0 {
w.saveContent(videoLinks, pageURL)
savePage = true
}

case config.QueryAudio:
// search for audio
// find audio URLs, output audio to the file while not saving already outputted ones
audioLinks := web.FindPageAudio(pageData, pageURL)
w.saveContent(audioLinks, pageURL)
if len(audioLinks) > 0 {
w.saveContent(audioLinks, pageURL)
savePage = true
}

case config.QueryDocuments:
// search for various documents
// find documents URLs, output docs to the file while not saving already outputted ones
docsLinks := web.FindPageDocuments(pageData, pageURL)
if len(docsLinks) > 0 {
w.saveContent(docsLinks, pageURL)
savePage = true
}

Expand All @@ -320,6 +336,7 @@ func (w *Worker) Work() {
contentLinks = append(contentLinks, web.FindPageImages(pageData, pageURL)...)
contentLinks = append(contentLinks, web.FindPageAudio(pageData, pageURL)...)
contentLinks = append(contentLinks, web.FindPageVideos(pageData, pageURL)...)
contentLinks = append(contentLinks, web.FindPageDocuments(pageData, pageURL)...)
w.saveContent(contentLinks, pageURL)

// email
Expand Down

0 comments on commit fd484c6

Please sign in to comment.