Skip to content

Commit

Permalink
Async: Use Mutex to prevent data race amongst various goroutines
Browse files Browse the repository at this point in the history
  • Loading branch information
MeNsaaH committed Apr 6, 2020
1 parent ed2ef64 commit 9e039d5
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 32 deletions.
20 changes: 14 additions & 6 deletions engine/besthdmovies.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,13 @@ func (engine *BestHDEngine) parseSingleMovie(el *colly.HTMLElement, movieIndex i
return movie, nil
}

func (engine *BestHDEngine) updateDownloadProps(downloadCollector *colly.Collector, movies map[string]*Movie) {
func (engine *BestHDEngine) updateDownloadProps(downloadCollector *colly.Collector, scrapedMovies *scraped) {
submissionDetails := make(map[string]string)
// Update movie download link if div.post-single-content on page
downloadCollector.OnHTML("div.post-single-content", func(e *colly.HTMLElement) {
movie := getMovieFromMovies(e.Request, movies)
movie := getMovieFromMovies(e.Request, scrapedMovies)
scrapedMovies.Lock()
defer scrapedMovies.Unlock()
ptags := e.ChildTexts("p")
if ptags[len(ptags)-3] >= ptags[len(ptags)-2] {
movie.Description = strings.TrimSpace(ptags[len(ptags)-3])
Expand All @@ -120,7 +122,7 @@ func (engine *BestHDEngine) updateDownloadProps(downloadCollector *colly.Collect
})

downloadCollector.OnHTML("div.content-area", func(e *colly.HTMLElement) {
movie := getMovieFromMovies(e.Request, movies)
movie := getMovieFromMovies(e.Request, scrapedMovies)
links := e.ChildAttrs("a", "href")
for _, link := range links {
if strings.HasPrefix(link, "https://zeefiles") || strings.HasPrefix(link, "http://zeefiles") {
Expand All @@ -130,6 +132,8 @@ func (engine *BestHDEngine) updateDownloadProps(downloadCollector *colly.Collect
}
downloadlink, err := url.Parse(link)
if err == nil {
scrapedMovies.Lock()
defer scrapedMovies.Unlock()
movie.DownloadLink = downloadlink
downloadCollector.Visit(downloadlink.String())
} else {
Expand All @@ -140,11 +144,13 @@ func (engine *BestHDEngine) updateDownloadProps(downloadCollector *colly.Collect
})

downloadCollector.OnHTML("div.freeDownload", func(e *colly.HTMLElement) {
movie := getMovieFromMovies(e.Request, movies)
movie := getMovieFromMovies(e.Request, scrapedMovies)
zeesubmission := make(map[string]string)
if e.ChildAttr("a.link_button", "href") != "" {
downloadlink, err := url.Parse(e.ChildAttr("a.link_button", "href"))
if err == nil {
scrapedMovies.Lock()
defer scrapedMovies.Unlock()
movie.DownloadLink = downloadlink
}
} else {
Expand All @@ -165,7 +171,7 @@ func (engine *BestHDEngine) updateDownloadProps(downloadCollector *colly.Collect

downloadCollector.OnHTML("form[method=post]", func(e *colly.HTMLElement) {
var err error
movie := getMovieFromMovies(e.Request, movies)
movie := getMovieFromMovies(e.Request, scrapedMovies)
downloadlink := movie.DownloadLink
inputNames := e.ChildAttrs("input", "name")
inputValues := e.ChildAttrs("input", "value")
Expand All @@ -188,7 +194,9 @@ func (engine *BestHDEngine) updateDownloadProps(downloadCollector *colly.Collect

downloadCollector.OnHTML("video", func(e *colly.HTMLElement) {
downloadlink := e.ChildAttr("source", "src")
movie := getMovieFromMovies(e.Request, movies)
movie := getMovieFromMovies(e.Request, scrapedMovies)
scrapedMovies.Lock()
defer scrapedMovies.Unlock()
movie.DownloadLink, _ = url.Parse(downloadlink)
})
}
Expand Down
43 changes: 26 additions & 17 deletions engine/engines.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"net/url"
"strconv"
"strings"
"sync"

"github.com/gocolly/colly/v2"
// "github.com/gocolly/colly/v2/debug"
Expand Down Expand Up @@ -43,7 +44,14 @@ type Engine interface {
getParseAttrs() (string, string, error)

// updateDownloadProps: registers callbacks on the download collector that fill in download details (link, size, ...) of the scraped movies
updateDownloadProps(downloadCollector *colly.Collector, movies map[string]*Movie)
updateDownloadProps(downloadCollector *colly.Collector, scrapedMovies *scraped)
}

// scraped is the shared store for every movie collected during a scrape.
// Entries are keyed by the movie's index rendered as a string. Because the
// store is touched from more than one goroutine, the embedded mutex must be
// held while the map, or a Movie reachable through it, is read or modified.
type scraped struct {
	// Embedded so callers can invoke Lock/Unlock directly on the store.
	sync.Mutex
	// movies maps a movie-index key to the movie being filled in.
	movies map[string]*Movie
}

// Scrape : Parse queries a url and return results
Expand All @@ -58,10 +66,10 @@ func Scrape(engine Engine) ([]Movie, error) {
// Another collector for download Links
downloadLinkCollector := c.Clone()

var movies = make(map[string]*Movie)
scrapedMovies := scraped{movies: make(map[string]*Movie)}

// Any extra setup the engine needs for the download collector is registered in this call
engine.updateDownloadProps(downloadLinkCollector, movies)
engine.updateDownloadProps(downloadLinkCollector, &scrapedMovies)

main, article, err := engine.getParseAttrs()
if err != nil {
Expand All @@ -75,8 +83,10 @@ func Scrape(engine Engine) ([]Movie, error) {
log.Errorf("%v could not be parsed", movie)
} else {
// Key the movie by its index (as a string) so every entry is unique; the same key is put in the request context for later lookup
scrapedMovies.Lock()
defer scrapedMovies.Unlock()
m := strconv.Itoa(movieIndex)
movies[m] = &movie
scrapedMovies.movies[m] = &movie
ctx := colly.NewContext()
ctx.Put("movieIndex", m)
downloadLinkCollector.Request("GET", movie.DownloadLink.String(), nil, ctx, nil)
Expand All @@ -99,35 +109,32 @@ func Scrape(engine Engine) ([]Movie, error) {
// movie details when we need it
downloadLinkCollector.OnRequest(func(r *colly.Request) {
r.Headers.Set("Accept", "text/html,application/xhtml+xml,application/xml")
movie := getMovieFromMovies(r, movies)
log.Debugf("Retrieving Download Link %v\n", movie.DownloadLink)
movie := getMovieFromMovies(r, &scrapedMovies)
log.Debugf("Retrieving Download Link %s\n", movie.Title)
})

// If Response Content Type is not Text, Abort the Request to prevent fully downloading the
// body in case of other types like mp4
downloadLinkCollector.OnResponseHeaders(func(r *colly.Response) {
log.Infof("%s", r.Headers)
if !strings.Contains(r.Headers.Get("Content-Type"), "text") {
log.Errorf("Response %s is not text/html. Aborting request", r.Request.URL)
log.Debugf("Response %s is not text/html. Aborting request", r.Request.URL)
r.Request.Abort()
}
})

downloadLinkCollector.OnResponse(func(r *colly.Response) {
movie := getMovieFromMovies(r.Request, movies)
log.Infof("Movie on Response %v", movie)
// prettyPrint([]Movie{*movie})
// log.Debugf("Retrieved Download Page %s\n", movie.DownloadLink.String())
movie := getMovieFromMovies(r.Request, &scrapedMovies)
log.Debugf("Retrieved Download Page %s\n", movie.Title)
})

c.Visit(engine.getParseURL().String())
c.Wait()
downloadLinkCollector.Wait()

// Create a List of Movies
v := make([]Movie, 0, len(movies))
v := make([]Movie, 0, len(scrapedMovies.movies))

for _, value := range movies {
for _, value := range scrapedMovies.movies {
v = append(v, *value)
}
prettyPrint(v)
Expand Down Expand Up @@ -241,10 +248,12 @@ func getMovieIndexFromCtx(r *colly.Request) int {
}

// getMovieFromMovies returns the movie stored under the "movieIndex" value
// carried in the request's context.
//
// The map is read while holding the scrapedMovies mutex. The lock is released
// when the function returns, so it does not protect the returned *Movie;
// callers that modify the movie take the lock again themselves.
//
// If no movie is stored under that key, a pointer to a new, empty Movie is
// returned instead of nil. That placeholder is not inserted into the map, so
// changes made to it are not kept.
func getMovieFromMovies(r *colly.Request, movies map[string]*Movie) *Movie {
func getMovieFromMovies(r *colly.Request, scrapedMovies *scraped) *Movie {
// The key was put into the request context when the request was created.
movieIndex := r.Ctx.Get("movieIndex")
if _, ok := movies[movieIndex]; ok {
return movies[movieIndex]
// Guard the map read; other goroutines may be inserting movies.
scrapedMovies.Lock()
defer scrapedMovies.Unlock()
if _, ok := scrapedMovies.movies[movieIndex]; ok {
return scrapedMovies.movies[movieIndex]
}
// Unknown key: hand back an empty placeholder so callers never get nil.
return &Movie{}
}
Expand Down
11 changes: 8 additions & 3 deletions engine/fzmovies.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,16 +90,19 @@ func (engine *FzEngine) parseSingleMovie(el *colly.HTMLElement, movieIndex int)
return movie, nil
}

func (engine *FzEngine) updateDownloadProps(downloadCollector *colly.Collector, movies map[string]*Movie) {
func (engine *FzEngine) updateDownloadProps(downloadCollector *colly.Collector, scrapedMovies *scraped) {
// Update movie download link if ul.downloadlinks on page
downloadCollector.OnHTML("ul.moviesfiles", func(e *colly.HTMLElement) {
movie := getMovieFromMovies(e.Request, movies)
movie := getMovieFromMovies(e.Request, scrapedMovies)
link := strings.Replace(e.ChildAttr("a", "href"), "download1.php", "download.php", 1)
downloadLink, err := url.Parse(e.Request.AbsoluteURL(link + "&pt=jRGarGzOo2"))
// downloadLink, err := url.Parse(e.ChildAttr("a", "href") + "&pt=jRGarGzOo2")
if err != nil {
log.Fatal(err)
}

scrapedMovies.Lock()
defer scrapedMovies.Unlock()
movie.DownloadLink = downloadLink
re := regexp.MustCompile(`(.* MB)`)
dl := strings.TrimPrefix(re.FindStringSubmatch(e.ChildText("dcounter"))[0], "(")
Expand All @@ -114,7 +117,9 @@ func (engine *FzEngine) updateDownloadProps(downloadCollector *colly.Collector,
if err != nil {
log.Fatal(err)
}
movie := getMovieFromMovies(e.Request, movies)
movie := getMovieFromMovies(e.Request, scrapedMovies)
scrapedMovies.Lock()
defer scrapedMovies.Unlock()
movie.DownloadLink = downloadLink
}
})
Expand Down
22 changes: 16 additions & 6 deletions engine/netnaija.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,12 @@ func (engine *NetNaijaEngine) parseSingleMovie(el *colly.HTMLElement, movieIndex
return movie, nil
}

func (engine *NetNaijaEngine) updateDownloadProps(downloadCollector *colly.Collector, movies map[string]*Movie) {
func (engine *NetNaijaEngine) updateDownloadProps(downloadCollector *colly.Collector, scrapedMovies *scraped) {
// Update movie size
downloadCollector.OnHTML("button[id=download-button]", func(e *colly.HTMLElement) {
movie := getMovieFromMovies(e.Request, movies)
movie := getMovieFromMovies(e.Request, scrapedMovies)
scrapedMovies.Lock()
defer scrapedMovies.Unlock()
movie.Size = strings.TrimSpace(e.ChildText("span.size"))
})

Expand All @@ -127,14 +129,18 @@ func (engine *NetNaijaEngine) updateDownloadProps(downloadCollector *colly.Colle
if err != nil {
log.Fatal(err)
}
movie := getMovieFromMovies(e.Request, movies)
movie := getMovieFromMovies(e.Request, scrapedMovies)
scrapedMovies.Lock()
defer scrapedMovies.Unlock()
movie.DownloadLink = downloadLink
downloadCollector.Visit(downloadLink.String())
})

// Update movie download link if a[id=download] on page
downloadCollector.OnHTML("a[id=download]", func(e *colly.HTMLElement) {
movie := getMovieFromMovies(e.Request, movies)
movie := getMovieFromMovies(e.Request, scrapedMovies)
scrapedMovies.Lock()
defer scrapedMovies.Unlock()
movie.Size = strings.TrimSpace(e.ChildText("span[id=download-size]"))
downloadLink, err := url.Parse(e.Attr("href"))
if err != nil {
Expand All @@ -151,15 +157,19 @@ func (engine *NetNaijaEngine) updateDownloadProps(downloadCollector *colly.Colle
if err != nil {
log.Fatal(err)
}
movie := getMovieFromMovies(e.Request, movies)
movie := getMovieFromMovies(e.Request, scrapedMovies)
log.Infof("Parsing Downloads %s %s", movie.Title, downloadLink.String())
scrapedMovies.Lock()
defer scrapedMovies.Unlock()
movie.DownloadLink = downloadLink
}
})

//for series or parts
downloadCollector.OnHTML("div.video-series-latest-episodes", func(inn *colly.HTMLElement) {
movie := getMovieFromMovies(inn.Request, movies)
movie := getMovieFromMovies(inn.Request, scrapedMovies)
scrapedMovies.Lock()
defer scrapedMovies.Unlock()
movie.IsSeries = true
inn.ForEach("a", func(_ int, e *colly.HTMLElement) {
downloadLink, err := url.Parse(e.Attr("href"))
Expand Down

0 comments on commit 9e039d5

Please sign in to comment.