Skip to content

Commit

Permalink
fix: use a hashset for large website check
Browse files Browse the repository at this point in the history
  • Loading branch information
Acollie committed May 9, 2024
1 parent aa78c3a commit 45956ca
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 15 deletions.
23 changes: 16 additions & 7 deletions formating/text.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,14 +112,23 @@ func removeMailTo(links []string) []string {
}

// removeLargeWebSites filters out links whose host is a well-known large
// website (social networks, video hosts, etc.), returning at most
// maxFilteredLinks of the remaining links in their original order.
//
// A link is dropped when its hostname — with any leading "www." removed —
// exactly matches one of the known large-site domains. Links that fail URL
// parsing are dropped too, since their host cannot be checked.
func removeLargeWebSites(links []string) []string {
	// Membership set; struct{} values add no per-entry storage.
	largeWebsites := map[string]struct{}{
		"facebook.com": {}, "twitter.com": {}, "instagram.com": {}, "youtube.com": {},
		"linkedin.com": {}, "pinterest.com": {}, "tumblr.com": {}, "reddit.com": {},
		"snapchat.com": {}, "whatsapp.com": {}, "quora.com": {}, "flickr.com": {},
		"vimeo.com": {}, "medium.com": {}, "vk.com": {}, "soundcloud.com": {},
	}

	// Hard cap on the number of links kept, preserving the original
	// behavior of breaking once 100 results were collected.
	const maxFilteredLinks = 100

	var result []string
	for _, link := range links {
		// Named "parsed" rather than "url" to avoid shadowing the
		// net/url package.
		parsed, err := url.Parse(link)
		if err != nil {
			// Unparseable link: skip it rather than guessing at its host.
			continue
		}
		// Strip a leading "www." so e.g. www.facebook.com is still
		// recognized; the pre-hashset code matched substrings and
		// caught these hosts.
		host := strings.TrimPrefix(parsed.Hostname(), "www.")
		if _, ok := largeWebsites[host]; ok {
			continue
		}
		result = append(result, link)
		if len(result) == maxFilteredLinks {
			break
		}
	}
	return result
}
17 changes: 10 additions & 7 deletions handler/type.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,18 @@ package handler

import (
"context"
"webcrawler/config"
"webcrawler/dynamoDBx"
"webcrawler/graphx"
"webcrawler/queue"
"webcrawler/site"
)

// Server bundles the dependencies a crawler handler needs: the work
// queue, the DynamoDB persistence layer, the link graph, and the
// ignore-list configuration used to filter out unwanted sites.
type Server struct {
	Queue  *queue.Handler
	Db     *dynamoDBx.DB
	Graph  *graphx.Graph
	Config *config.IgnoreList
}

type DBi interface {
Expand All @@ -24,11 +26,12 @@ type DBi interface {
UpdateWebsite(context.Context, site.Page, site.Website) error
}

func New(db *dynamoDBx.DB, queue *queue.Handler, graph *graphx.Graph) Server {
func New(db *dynamoDBx.DB, queue *queue.Handler, graph *graphx.Graph, config *config.IgnoreList) Server {
return Server{
Db: db,
Queue: queue,
Graph: graph,
Db: db,
Queue: queue,
Graph: graph,
Config: config,
}

}
4 changes: 3 additions & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"log"
"os"
"webcrawler/awsx"
localConfig "webcrawler/config"
"webcrawler/dynamoDBx"
"webcrawler/graphx"
"webcrawler/handler"
Expand Down Expand Up @@ -42,8 +43,9 @@ func main() {
log.Fatalf("Cannot connect to the graph database: %s", err)
}
graph := graphx.New(graphConn)
config := localConfig.Fetch()

server := handler.New(dbClient, sqsClient, graph)
server := handler.New(dbClient, sqsClient, graph, config)

initialLinks := []string{
"https://blog.alexcollie.com/",
Expand Down

0 comments on commit 45956ca

Please sign in to comment.