From a2b2fb9596dc74f65e7cfb7cc1d6e88f4607fcdf Mon Sep 17 00:00:00 2001 From: Manasseh Mmadu Date: Fri, 3 Apr 2020 13:55:05 +0100 Subject: [PATCH] Async: Started Async Porting --- engine/besthdmovies.go | 23 +++++++-------- engine/engines.go | 64 +++++++++++++++++++++++++++++++----------- engine/fzmovies.go | 15 ++++++---- engine/netnaija.go | 18 ++++++------ main.go | 1 + 5 files changed, 79 insertions(+), 42 deletions(-) diff --git a/engine/besthdmovies.go b/engine/besthdmovies.go index 2fcd94d..f4d4580 100644 --- a/engine/besthdmovies.go +++ b/engine/besthdmovies.go @@ -57,9 +57,8 @@ func (engine *BestHDEngine) getParseAttrs() (string, string, error) { return "body", "article.latestPost", nil } -func (engine *BestHDEngine) parseSingleMovie(el *colly.HTMLElement, index int) (Movie, error) { +func (engine *BestHDEngine) parseSingleMovie(el *colly.HTMLElement) (Movie, error) { movie := Movie{ - Index: index, IsSeries: false, Source: engine.Name, } @@ -89,11 +88,11 @@ func (engine *BestHDEngine) parseSingleMovie(el *colly.HTMLElement, index int) ( return movie, nil } -func (engine *BestHDEngine) updateDownloadProps(downloadCollector *colly.Collector, movies *[]Movie) { +func (engine *BestHDEngine) updateDownloadProps(downloadCollector *colly.Collector, movies map[string]*Movie) { submissionDetails := make(map[string]string) // Update movie download link if div.post-single-content on page downloadCollector.OnHTML("div.post-single-content", func(e *colly.HTMLElement) { - movie := &(*movies)[getMovieIndexFromCtx(e.Request)] + movie := getMovieFromMovies(e.Request.URL.String(), movies) ptags := e.ChildTexts("p") if ptags[len(ptags)-3] >= ptags[len(ptags)-2] { movie.Description = strings.TrimSpace(ptags[len(ptags)-3]) @@ -120,7 +119,7 @@ func (engine *BestHDEngine) updateDownloadProps(downloadCollector *colly.Collect }) downloadCollector.OnHTML("div.content-area", func(e *colly.HTMLElement) { - movie := &(*movies)[getMovieIndexFromCtx(e.Request)] + movie := getMovieFromMovies(e.Request.URL.String(), movies) links := e.ChildAttrs("a", "href") for _, link := range links { if strings.HasPrefix(link, "https://zeefiles") || strings.HasPrefix(link, "http://zeefiles") { @@ -140,8 +139,7 @@ func (engine *BestHDEngine) updateDownloadProps(downloadCollector *colly.Collect }) downloadCollector.OnHTML("div.freeDownload", func(e *colly.HTMLElement) { - movieIndex := getMovieIndexFromCtx(e.Request) - movie := &(*movies)[movieIndex] + movie := getMovieFromMovies(e.Request.URL.String(), movies) zeesubmission := make(map[string]string) if e.ChildAttr("a.link_button", "href") != "" { downloadlink, err := url.Parse(e.ChildAttr("a.link_button", "href")) @@ -157,7 +155,7 @@ func (engine *BestHDEngine) updateDownloadProps(downloadCollector *colly.Collect zeesubmission[inputNames[index]] = inputValues[index] } - err := downloadCollector.Post(movie.DownloadLink.String(), zeesubmission) + err := downloadCollector.Post((*movie).DownloadLink.String(), zeesubmission) if err != nil { log.Fatal(err) } @@ -165,9 +163,8 @@ func (engine *BestHDEngine) updateDownloadProps(downloadCollector *colly.Collect }) downloadCollector.OnHTML("form[method=post]", func(e *colly.HTMLElement) { - movieIndex := getMovieIndexFromCtx(e.Request) var err error - movie := &(*movies)[movieIndex] + movie := getMovieFromMovies(e.Request.URL.String(), movies) downloadlink := movie.DownloadLink inputNames := e.ChildAttrs("input", "name") inputValues := e.ChildAttrs("input", "value") @@ -177,7 +174,8 @@ func (engine *BestHDEngine) updateDownloadProps(downloadCollector *colly.Collect } requestlink := e.Request.URL.String() if !(strings.HasPrefix(requestlink, "https://zeefiles") || strings.HasPrefix(requestlink, "http://zeefiles")) { - downloadlink, err = url.Parse("https://udown.me/watchonline/?movieIndex=" + strconv.Itoa(movieIndex)) + // TODO Dynamically assign movieIndex + downloadlink, err = url.Parse("https://udown.me/watchonline/?movieIndex=1") if err == nil { movie.DownloadLink = downloadlink } @@ -190,8 +188,7 @@ func (engine *BestHDEngine) updateDownloadProps(downloadCollector *colly.Collect downloadCollector.OnHTML("video", func(e *colly.HTMLElement) { downloadlink := e.ChildAttr("source", "src") - movieIndex := getMovieIndexFromCtx(e.Request) - movie := &(*movies)[movieIndex] + movie := getMovieFromMovies(e.Request.URL.String(), movies) movie.DownloadLink, _ = url.Parse(downloadlink) }) } diff --git a/engine/engines.go b/engine/engines.go index 5863928..54427a0 100644 --- a/engine/engines.go +++ b/engine/engines.go @@ -9,6 +9,7 @@ import ( "strings" "github.com/gocolly/colly/v2" + // "github.com/gocolly/colly/v2/debug" log "github.com/sirupsen/logrus" ) @@ -33,7 +34,7 @@ type Engine interface { List(page int) SearchResult String() string // parseSingleMovie: parses the result of a colly HTMLElement and returns a movie - parseSingleMovie(el *colly.HTMLElement, index int) (Movie, error) + parseSingleMovie(el *colly.HTMLElement) (Movie, error) // getParseAttrs : get the attributes to use to parse a returned soup // the first return string is the part of the html to be parsed e.g `body`, `main` @@ -42,7 +43,7 @@ type Engine interface { getParseAttrs() (string, string, error) // parseSingleMovie: parses the result of a colly HTMLElement and returns a movie - updateDownloadProps(downloadCollector *colly.Collector, movies *[]Movie) + updateDownloadProps(downloadCollector *colly.Collector, movies map[string]*Movie) } // Scrape : Parse queries a url and return results @@ -51,15 +52,16 @@ func Scrape(engine Engine) ([]Movie, error) { // Cache responses to prevent multiple download of pages // even if the collector is restarted colly.CacheDir("./gophie_cache"), + colly.Async(true), + // colly.Debugger(&debug.LogDebugger{}), ) // Another collector for download Links downloadLinkCollector := c.Clone() - movieIndex := 0 - var movies []Movie + var movies = make(map[string]*Movie) // Any Extras setup for downloads using can be specified in the function - engine.updateDownloadProps(downloadLinkCollector, &movies) + engine.updateDownloadProps(downloadLinkCollector, movies) main, article, err := engine.getParseAttrs() if err != nil { @@ -67,13 +69,13 @@ func Scrape(engine Engine) ([]Movie, error) { } c.OnHTML(main, func(e *colly.HTMLElement) { e.ForEach(article, func(_ int, el *colly.HTMLElement) { - movie, err := engine.parseSingleMovie(el, movieIndex) + movie, err := engine.parseSingleMovie(el) if err != nil { log.Errorf("%v could not be parsed", movie) } else { - movies = append(movies, movie) + // Using DownloadLink as key to movie makes it unique + movies[movie.DownloadLink.String()] = &movie downloadLinkCollector.Visit(movie.DownloadLink.String()) - movieIndex++ } }) }) @@ -92,11 +94,8 @@ func Scrape(engine Engine) ([]Movie, error) { // movie details when we need it downloadLinkCollector.OnRequest(func(r *colly.Request) { r.Headers.Set("Accept", "text/html,application/xhtml+xml,application/xml") - for i, movie := range movies { - if movie.DownloadLink.String() == r.URL.String() { - log.Debugf("Retrieving Download Link %v\n", movie.DownloadLink) - r.Ctx.Put("movieIndex", strconv.Itoa(i)) - } + if movie, ok := movies[r.URL.String()]; ok { + log.Debugf("Retrieving Download Link %v\n", movie.DownloadLink) } }) @@ -110,11 +109,24 @@ func Scrape(engine Engine) ([]Movie, error) { }) downloadLinkCollector.OnResponse(func(r *colly.Response) { - movie := &movies[getMovieIndexFromCtx(r.Request)] - log.Debugf("Retrieved Download Link %v\n", movie.DownloadLink) + // movie := movies[r.Request.URL.String()] + // log.Infof("%s %v %s", r.Request.URL.String(), movie.DownloadLink, movie.Title) + // log.Debugf("Retrieved Download Link %v\n", movie.DownloadLink) }) + c.Visit(engine.getParseURL().String()) - return movies, nil + c.Wait() + downloadLinkCollector.Wait() + + // Create a List of Movies + v := make([]Movie, 0, len(movies)) + + for _, value := range movies { + v = append(v, *value) + } + prettyPrint(v) + + return v, nil } // Movie : the structure of all downloadable movies @@ -221,3 +233,23 @@ func getMovieIndexFromCtx(r *colly.Request) int { } return movieIndex } + +// Get Movie from a URL +func getMovieFromMovies(url string, movies map[string]*Movie) *Movie { + if _, ok := movies[url]; ok { + return movies[url] + } + for _, movie := range movies { + if (*movie).DownloadLink.String() == url { + return movie + } + } + return &Movie{} +} + +func prettyPrint(s []Movie) { + b, err := json.MarshalIndent(s, "", " ") + if err == nil { + fmt.Println(string(b)) + } +} diff --git a/engine/fzmovies.go b/engine/fzmovies.go index 6bb8e81..5b77814 100644 --- a/engine/fzmovies.go +++ b/engine/fzmovies.go @@ -58,9 +58,8 @@ func (engine *FzEngine) getParseAttrs() (string, string, error) { return "body", "div.mainbox", nil } -func (engine *FzEngine) parseSingleMovie(el *colly.HTMLElement, index int) (Movie, error) { +func (engine *FzEngine) parseSingleMovie(el *colly.HTMLElement) (Movie, error) { movie := Movie{ - Index: index, IsSeries: false, Source: engine.Name, } @@ -71,6 +70,11 @@ func (engine *FzEngine) parseSingleMovie(el *colly.HTMLElement, index int) (Movi movie.CoverPhotoLink = cover.String() // Remove all Video: or Movie: Prefixes movie.UploadDate = strings.TrimSpace(el.ChildTexts("small")[1]) + // Update Year + year, err := strconv.Atoi(strings.TrimSpace(el.ChildTexts("small")[1])) + if err == nil { + movie.Year = year + } movie.Title = strings.TrimSuffix(strings.TrimSpace(el.ChildText("b")), "") movie.Description = strings.TrimSpace(el.ChildTexts("small")[3]) downloadLink, err := url.Parse(el.Request.AbsoluteURL(el.ChildAttr("a", "href"))) @@ -85,10 +89,10 @@ func (engine *FzEngine) parseSingleMovie(el *colly.HTMLElement, index int) (Movi return movie, nil } -func (engine *FzEngine) updateDownloadProps(downloadCollector *colly.Collector, movies *[]Movie) { +func (engine *FzEngine) updateDownloadProps(downloadCollector *colly.Collector, movies map[string]*Movie) { // Update movie download link if ul.downloadlinks on page downloadCollector.OnHTML("ul.moviesfiles", func(e *colly.HTMLElement) { - movie := &(*movies)[getMovieIndexFromCtx(e.Request)] + movie := getMovieFromMovies(e.Request.URL.String(), movies) link := strings.Replace(e.ChildAttr("a", "href"), "download1.php", "download.php", 1) downloadLink, err := url.Parse(e.Request.AbsoluteURL(link + "&pt=jRGarGzOo2")) // downloadLink, err := url.Parse(e.ChildAttr("a", "href") + "&pt=jRGarGzOo2") @@ -109,7 +113,8 @@ func (engine *FzEngine) updateDownloadProps(downloadCollector *colly.Collector, if err != nil { log.Fatal(err) } - (*movies)[getMovieIndexFromCtx(e.Request)].DownloadLink = downloadLink + movie := getMovieFromMovies(e.Request.URL.String(), movies) + movie.DownloadLink = downloadLink } }) } diff --git a/engine/netnaija.go b/engine/netnaija.go index 279a77c..9a75316 100644 --- a/engine/netnaija.go +++ b/engine/netnaija.go @@ -69,7 +69,7 @@ func (engine *NetNaijaEngine) getParseAttrs() (string, string, error) { return "main", article, nil } -func (engine *NetNaijaEngine) parseSingleMovie(el *colly.HTMLElement, index int) (Movie, error) { +func (engine *NetNaijaEngine) parseSingleMovie(el *colly.HTMLElement) (Movie, error) { // movie title identifier var title string if title = "h3.file-name"; engine.mode == SearchMode { @@ -78,7 +78,6 @@ func (engine *NetNaijaEngine) parseSingleMovie(el *colly.HTMLElement, index int) re := regexp.MustCompile(`\((.*)\)`) movie := Movie{ - Index: index, IsSeries: false, Source: engine.Name, } @@ -115,10 +114,11 @@ func (engine *NetNaijaEngine) parseSingleMovie(el *colly.HTMLElement, index int) return movie, nil } -func (engine *NetNaijaEngine) updateDownloadProps(downloadCollector *colly.Collector, movies *[]Movie) { +func (engine *NetNaijaEngine) updateDownloadProps(downloadCollector *colly.Collector, movies map[string]*Movie) { // Update movie size downloadCollector.OnHTML("button[id=download-button]", func(e *colly.HTMLElement) { - (*movies)[getMovieIndexFromCtx(e.Request)].Size = strings.TrimSpace(e.ChildText("span.size")) + movie := getMovieFromMovies(e.Request.URL.String(), movies) + movie.Size = strings.TrimSpace(e.ChildText("span.size")) }) downloadCollector.OnHTML("h3.file-name", func(e *colly.HTMLElement) { @@ -126,13 +126,14 @@ func (engine *NetNaijaEngine) updateDownloadProps(downloadCollector *colly.Colle if err != nil { log.Fatal(err) } - (*movies)[getMovieIndexFromCtx(e.Request)].DownloadLink = downloadLink + movie := getMovieFromMovies(e.Request.URL.String(), movies) + movie.DownloadLink = downloadLink downloadCollector.Visit(downloadLink.String()) }) // Update movie download link if a[id=download] on page downloadCollector.OnHTML("a[id=download]", func(e *colly.HTMLElement) { - movie := &((*movies)[getMovieIndexFromCtx(e.Request)]) + movie := getMovieFromMovies(e.Request.URL.String(), movies) movie.Size = strings.TrimSpace(e.ChildText("span[id=download-size]")) downloadLink, err := url.Parse(e.Attr("href")) if err != nil { @@ -148,13 +149,14 @@ func (engine *NetNaijaEngine) updateDownloadProps(downloadCollector *colly.Colle if err != nil { log.Fatal(err) } - (*movies)[getMovieIndexFromCtx(e.Request)].DownloadLink = downloadLink + movie := getMovieFromMovies(e.Request.URL.String(), movies) + movie.DownloadLink = downloadLink } }) //for series or parts downloadCollector.OnHTML("div.video-series-latest-episodes", func(inn *colly.HTMLElement) { - movie := &((*movies)[getMovieIndexFromCtx(inn.Request)]) + movie := getMovieFromMovies(inn.Request.URL.String(), movies) movie.IsSeries = true inn.ForEach("a", func(_ int, e *colly.HTMLElement) { downloadLink, err := url.Parse(e.Attr("href")) diff --git a/main.go b/main.go index 5d2350a..8b3a1d0 100644 --- a/main.go +++ b/main.go @@ -5,5 +5,6 @@ import ( ) func main() { + cmd.Execute() }