From 45956ca0d55ebd0956e7f66d61f7ae410281dcea Mon Sep 17 00:00:00 2001
From: acollie
Date: Thu, 9 May 2024 22:06:11 +0100
Subject: [PATCH] fix: use a hashset for large website check

---
 formating/text.go | 28 +++++++++++++++++++++-------
 handler/type.go   | 17 ++++++++++-------
 main.go           |  4 +++-
 3 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/formating/text.go b/formating/text.go
index 76f6985..00d5011 100644
--- a/formating/text.go
+++ b/formating/text.go
@@ -112,14 +112,28 @@ func removeMailTo(links []string) []string {
 }
 
 func removeLargeWebSites(links []string) []string {
-	largeWebsites := []string{"facebook.com", "twitter.com", "instagram.com", "youtube.com", "linkedin.com", "pinterest.com", "tumblr.com", "reddit.com", "snapchat.com", "whatsapp.com", "quora.com", "flickr.com", "vimeo.com", "medium.com", "vk.com", "soundcloud.com"}
+	largeWebsites := map[string]struct{}{
+		"facebook.com": {}, "twitter.com": {}, "instagram.com": {}, "youtube.com": {},
+		"linkedin.com": {}, "pinterest.com": {}, "tumblr.com": {}, "reddit.com": {},
+		"snapchat.com": {}, "whatsapp.com": {}, "quora.com": {}, "flickr.com": {},
+		"vimeo.com": {}, "medium.com": {}, "vk.com": {}, "soundcloud.com": {},
+	}
 
-	for _, link := range links {
-		for _, website := range largeWebsites {
-			if strings.Contains(link, website) {
-				continue
-			}
+	var result []string
+	for _, link := range links {
+		parsed, err := url.Parse(link)
+		if err != nil {
+			// Links that cannot be parsed cannot be checked; drop them.
+			continue
+		}
+		if _, ok := largeWebsites[parsed.Hostname()]; ok {
+			continue
+		}
+		result = append(result, link)
+		// Cap the output at 100 links per page.
+		if len(result) == 100 {
+			break
 		}
 	}
-	return links
+	return result
 }
diff --git a/handler/type.go b/handler/type.go
index 208236b..035cc28 100644
--- a/handler/type.go
+++ b/handler/type.go
@@ -2,6 +2,7 @@ package handler
 
 import (
 	"context"
+	"webcrawler/config"
 	"webcrawler/dynamoDBx"
 	"webcrawler/graphx"
 	"webcrawler/queue"
@@ -9,9 +10,10 @@
 )
 
 type Server struct {
-	Queue *queue.Handler
-	Db    *dynamoDBx.DB
-	Graph *graphx.Graph
+	Queue  *queue.Handler
+	Db     *dynamoDBx.DB
+	Graph  *graphx.Graph
+	Config *config.IgnoreList
 }
 
 type DBi interface {
@@ -24,11 +26,12 @@ type DBi interface {
 	UpdateWebsite(context.Context, site.Page, site.Website) error
 }
 
-func New(db *dynamoDBx.DB, queue *queue.Handler, graph *graphx.Graph) Server {
+func New(db *dynamoDBx.DB, queue *queue.Handler, graph *graphx.Graph, config *config.IgnoreList) Server {
 	return Server{
-		Db:    db,
-		Queue: queue,
-		Graph: graph,
+		Db:     db,
+		Queue:  queue,
+		Graph:  graph,
+		Config: config,
 	}
 }
 
diff --git a/main.go b/main.go
index 3b921eb..0147443 100644
--- a/main.go
+++ b/main.go
@@ -6,6 +6,7 @@ import (
 	"log"
 	"os"
 	"webcrawler/awsx"
+	localConfig "webcrawler/config"
 	"webcrawler/dynamoDBx"
 	"webcrawler/graphx"
 	"webcrawler/handler"
@@ -42,8 +43,9 @@ func main() {
 		log.Fatalf("Cannot connect to the graph database: %s", err)
 	}
 	graph := graphx.New(graphConn)
+	config := localConfig.Fetch()
 
-	server := handler.New(dbClient, sqsClient, graph)
+	server := handler.New(dbClient, sqsClient, graph, config)
 
 	initialLinks := []string{
		"https://blog.alexcollie.com/",
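
Two caveats on the formating/text.go hunk. First, the new loop calls url.Parse, but the hunk does not touch the file's import block, so it assumes net/url is already imported in formating/text.go; if it is not, the patch needs one more hunk adding that import. Second, the match semantics change: the old strings.Contains check caught "facebook.com" anywhere in the link, so https://www.facebook.com/... was filtered, while an exact lookup on Hostname() misses subdomains ("www.facebook.com" is not a key in the set). If subdomains should still be filtered, a suffix-aware lookup is one option. Below is a minimal sketch under those assumptions; the helper name isLargeWebsite is illustrative and not part of the patch:

```go
package formating

import (
	"net/url"
	"strings"
)

// isLargeWebsite reports whether link's host is a blocked domain or a
// subdomain of one, e.g. both facebook.com and www.facebook.com match
// a "facebook.com" entry.
func isLargeWebsite(link string, blocked map[string]struct{}) bool {
	parsed, err := url.Parse(link)
	if err != nil {
		return false
	}
	host := parsed.Hostname()
	// Walk up the domain labels: www.facebook.com -> facebook.com -> com.
	for host != "" {
		if _, ok := blocked[host]; ok {
			return true
		}
		i := strings.Index(host, ".")
		if i < 0 {
			break
		}
		host = host[i+1:]
	}
	return false
}
```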
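The behaviour of the patched removeLargeWebSites (exact-hostname matching, order preserved, 100-link cap) can be pinned down with a small test. This is a suggested addition, not part of the patch; it assumes it lives alongside text.go in package formating:

```go
package formating

import (
	"reflect"
	"testing"
)

func TestRemoveLargeWebSites(t *testing.T) {
	links := []string{
		"https://blog.alexcollie.com/",
		"https://facebook.com/somepage", // exact hostname match: dropped
		"https://example.org/about",
	}
	got := removeLargeWebSites(links)
	want := []string{
		"https://blog.alexcollie.com/",
		"https://example.org/about",
	}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("removeLargeWebSites() = %v, want %v", got, want)
	}
}
```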
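Finally, the patch wires a *config.IgnoreList through handler.New, but the webcrawler/config package itself is not in the diff, and the new Config field is not yet read by any of the shown hunks (removeLargeWebSites still uses its own hard-coded set). The shape implied by the call sites, localConfig.Fetch() returning a value accepted as *config.IgnoreList, might look like the sketch below; the field layout and the placeholder contents are assumptions, not the real package:

```go
package config

// IgnoreList holds hosts the crawler should never enqueue.
// This definition is an assumption; the real type lives in
// webcrawler/config, which this patch does not show.
type IgnoreList struct {
	Hosts map[string]struct{}
}

// Fetch returns the ignore list. A hard-coded set stands in here;
// the actual source (file, env, database) is not shown in the patch.
func Fetch() *IgnoreList {
	return &IgnoreList{
		Hosts: map[string]struct{}{
			"facebook.com": {},
			"twitter.com":  {},
		},
	}
}
```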