feat(twitter): Scraper Enhancements Account Rotation and Rate Limit Handling #576

Merged: 17 commits, Oct 4, 2024
Changes from all commits (17 commits)
1 change: 0 additions & 1 deletion go.mod
@@ -218,7 +218,6 @@ require (
github.com/spaolacci/murmur3 v1.1.0 // indirect
github.com/spf13/afero v1.11.0 // indirect
github.com/spf13/cast v1.6.0 // indirect
github.com/stretchr/objx v0.5.2 // indirect
github.com/subosito/gotenv v1.6.0 // indirect
github.com/supranational/blst v0.3.13 // indirect
github.com/syndtr/goleveldb v1.0.1-0.20210819022825-2ae1ddf74ef7 // indirect
1 change: 0 additions & 1 deletion go.sum
@@ -713,7 +713,6 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
45 changes: 45 additions & 0 deletions pkg/scrapers/twitter/account.go
@@ -0,0 +1,45 @@
package twitter

import (
"sync"
"time"
)

type TwitterAccount struct {
Username string
Password string
TwoFACode string
RateLimitedUntil time.Time
}

type TwitterAccountManager struct {
accounts []*TwitterAccount
index int
mutex sync.Mutex
}

func NewTwitterAccountManager(accounts []*TwitterAccount) *TwitterAccountManager {
return &TwitterAccountManager{
accounts: accounts,
index: 0,
}
}

func (manager *TwitterAccountManager) GetNextAccount() *TwitterAccount {
manager.mutex.Lock()
defer manager.mutex.Unlock()
for i := 0; i < len(manager.accounts); i++ {
account := manager.accounts[manager.index]
manager.index = (manager.index + 1) % len(manager.accounts)
if time.Now().After(account.RateLimitedUntil) {
return account
}
}
return nil
}

func (manager *TwitterAccountManager) MarkAccountRateLimited(account *TwitterAccount) {
manager.mutex.Lock()
defer manager.mutex.Unlock()
account.RateLimitedUntil = time.Now().Add(GetRateLimitDuration())
}
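
The manager above is a mutex-guarded round-robin rotator: GetNextAccount advances the index and returns the first account whose RateLimitedUntil has already passed, or nil once every account is benched. A minimal, hypothetical usage sketch inside the twitter package (the real caller in this PR is getAuthenticatedScraper in common.go):

func exampleRotation() {
    manager := NewTwitterAccountManager([]*TwitterAccount{
        {Username: "user1", Password: "pass1"},
        {Username: "user2", Password: "pass2"},
    })

    account := manager.GetNextAccount()
    if account == nil {
        return // every account is still inside its rate-limit window
    }

    // ...use the account; when the scraper later reports a rate limit:
    manager.MarkAccountRateLimited(account) // benched until now + GetRateLimitDuration()
}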
72 changes: 38 additions & 34 deletions pkg/scrapers/twitter/auth.go
@@ -3,49 +3,53 @@ package twitter
import (
"fmt"

twitterscraper "github.com/masa-finance/masa-twitter-scraper"
"github.com/sirupsen/logrus"
)

// Login attempts to log in to the Twitter scraper service.
// It supports three modes of operation:
// 1. Basic login using just a username and password.
// 2. Login requiring an email confirmation, using a username, password, and email address.
// 3. Login with two-factor authentication, using a username, password, and 2FA code.
// Parameters:
// - scraper: A pointer to an instance of the twitterscraper.Scraper.
// - credentials: A variadic list of strings representing login credentials.
// The function expects either two strings (username, password) for basic login,
// or three strings (username, password, email/2FA code) for email confirmation or 2FA.
//
// Returns an error if login fails or if an invalid number of credentials is provided.
func Login(scraper *twitterscraper.Scraper, credentials ...string) error {
func NewScraper(account *TwitterAccount, cookieDir string) *Scraper {
scraper := &Scraper{Scraper: newTwitterScraper()}

if err := LoadCookies(scraper.Scraper, account, cookieDir); err == nil {
logrus.Debugf("Cookies loaded for user %s.", account.Username)
if scraper.IsLoggedIn() {
logrus.Debugf("Already logged in as %s.", account.Username)
return scraper
}
}

ShortSleep()

if err := scraper.Login(account.Username, account.Password, account.TwoFACode); err != nil {
logrus.WithError(err).Warnf("Login failed for %s", account.Username)
return nil
}

ShortSleep()

if err := SaveCookies(scraper.Scraper, account, cookieDir); err != nil {
logrus.WithError(err).Errorf("Failed to save cookies for %s", account.Username)
}

logrus.Debugf("Login successful for %s", account.Username)
return scraper
}

func (scraper *Scraper) Login(username, password string, twoFACode ...string) error {
var err error
switch len(credentials) {
case 2:
// Basic login with username and password.
err = scraper.Login(credentials[0], credentials[1])
case 3:
// The third parameter is used for either email confirmation or a 2FA code.
// This design assumes the Twitter scraper's Login method can contextually handle both cases.
err = scraper.Login(credentials[0], credentials[1], credentials[2])
default:
// Return an error if the number of provided credentials is neither 2 nor 3.
return fmt.Errorf("invalid number of login credentials provided")
if len(twoFACode) > 0 {
err = scraper.Scraper.Login(username, password, twoFACode[0])
} else {
err = scraper.Scraper.Login(username, password)
}
if err != nil {
return fmt.Errorf("%v", err)
return fmt.Errorf("login failed: %v", err)
}
return nil
}

func IsLoggedIn(scraper *twitterscraper.Scraper) bool {
return scraper.IsLoggedIn()
}

func Logout(scraper *twitterscraper.Scraper) error {
err := scraper.Logout()
if err != nil {
return fmt.Errorf("[-] Logout failed: %v", err)
func (scraper *Scraper) Logout() error {
if err := scraper.Scraper.Logout(); err != nil {
return fmt.Errorf("logout failed: %v", err)
}
return nil
}
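
The reworked auth flow tries saved cookies first, falls back to a credential login (with an optional 2FA code), and re-saves cookies on success; NewScraper returns nil when both paths fail. A hypothetical direct-use sketch (callers in this PR normally go through getAuthenticatedScraper instead; the cookie directory is illustrative):

func exampleAuth() {
    account := &TwitterAccount{Username: "user1", Password: "pass1"}
    scraper := NewScraper(account, "/tmp/masa-cookies")
    if scraper == nil {
        return // cookie load and credential login both failed
    }
    _ = scraper.Logout() // end the session when finished
}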
85 changes: 85 additions & 0 deletions pkg/scrapers/twitter/common.go
@@ -0,0 +1,85 @@
package twitter

import (
"fmt"
"os"
"strings"
"sync"

"github.com/joho/godotenv"
"github.com/masa-finance/masa-oracle/pkg/config"
"github.com/sirupsen/logrus"
)

var (
accountManager *TwitterAccountManager
once sync.Once
)

func initializeAccountManager() {
accounts := loadAccountsFromConfig()
accountManager = NewTwitterAccountManager(accounts)
}

func loadAccountsFromConfig() []*TwitterAccount {
err := godotenv.Load()
if err != nil {
logrus.Fatalf("error loading .env file: %v", err)
}

accountsEnv := os.Getenv("TWITTER_ACCOUNTS")
if accountsEnv == "" {
logrus.Fatal("TWITTER_ACCOUNTS not set in .env file")
}

return parseAccounts(strings.Split(accountsEnv, ","))
}

func parseAccounts(accountPairs []string) []*TwitterAccount {
return filterMap(accountPairs, func(pair string) (*TwitterAccount, bool) {
credentials := strings.Split(pair, ":")
if len(credentials) != 2 {
logrus.Warnf("invalid account credentials: %s", pair)
return nil, false
}
return &TwitterAccount{
Username: strings.TrimSpace(credentials[0]),
Password: strings.TrimSpace(credentials[1]),
}, true
})
}

func getAuthenticatedScraper() (*Scraper, *TwitterAccount, error) {
once.Do(initializeAccountManager)
baseDir := config.GetInstance().MasaDir

account := accountManager.GetNextAccount()
if account == nil {
return nil, nil, fmt.Errorf("all accounts are rate-limited")
}
scraper := NewScraper(account, baseDir)
if scraper == nil {
logrus.Errorf("Authentication failed for %s", account.Username)
return nil, account, fmt.Errorf("Twitter authentication failed for %s", account.Username)
}
return scraper, account, nil
}

func handleRateLimit(err error, account *TwitterAccount) bool {
if strings.Contains(err.Error(), "Rate limit exceeded") {
accountManager.MarkAccountRateLimited(account)
logrus.Warnf("rate limited: %s", account.Username)
return true
}
return false
}

func filterMap[T any, R any](slice []T, f func(T) (R, bool)) []R {
result := make([]R, 0, len(slice))
for _, v := range slice {
if r, ok := f(v); ok {
result = append(result, r)
}
}
return result
}
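
Accounts are loaded once (guarded by sync.Once) from a TWITTER_ACCOUNTS variable in .env, given as a comma-separated list of username:password pairs, for example TWITTER_ACCOUNTS=user1:pass1,user2:pass2. A small package-internal sketch of the parsing behaviour (hypothetical helper; strings is already imported by common.go):

func exampleParse() {
    pairs := strings.Split("user1:pass1,user2:pass2,badentry", ",")
    accounts := parseAccounts(pairs) // "badentry" has no ':' pair, so it is warned about and skipped
    _ = accounts                     // two valid accounts remain
}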
35 changes: 35 additions & 0 deletions pkg/scrapers/twitter/config.go
@@ -0,0 +1,35 @@
package twitter

import (
"fmt"
"time"

"github.com/sirupsen/logrus"
)

const (
ShortSleepDuration = 20 * time.Millisecond
RateLimitDuration = time.Hour
MaxRetries = 3
)

func ShortSleep() {
time.Sleep(ShortSleepDuration)
}

func GetRateLimitDuration() time.Duration {
return RateLimitDuration
}

func Retry[T any](operation func() (T, error), maxAttempts int) (T, error) {
var zero T
for attempt := 1; attempt <= maxAttempts; attempt++ {
result, err := operation()
if err == nil {
return result, nil
}
logrus.Errorf("retry attempt %d failed: %v", attempt, err)
time.Sleep(time.Duration(attempt) * time.Second)
}
return zero, fmt.Errorf("operation failed after %d attempts", maxAttempts)
}
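
Retry runs the operation up to maxAttempts times, sleeping attempt seconds after each failure (linear backoff), and returns the type's zero value plus an error if every attempt fails. A short sketch of the generic helper in use (hypothetical operation; fmt is already imported by config.go):

func exampleRetry() {
    attempts := 0
    result, err := Retry(func() (string, error) {
        attempts++
        if attempts < 2 {
            return "", fmt.Errorf("transient failure")
        }
        return "ok", nil
    }, MaxRetries)
    _ = result // "ok" on the second attempt
    _ = err    // nil
}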
27 changes: 11 additions & 16 deletions pkg/scrapers/twitter/cookies.go
@@ -5,37 +5,32 @@ import (
"fmt"
"net/http"
"os"
"path/filepath"

twitterscraper "github.com/masa-finance/masa-twitter-scraper"
)

func SaveCookies(scraper *twitterscraper.Scraper, filePath string) error {
func SaveCookies(scraper *twitterscraper.Scraper, account *TwitterAccount, baseDir string) error {
cookieFile := filepath.Join(baseDir, fmt.Sprintf("%s_twitter_cookies.json", account.Username))
cookies := scraper.GetCookies()
js, err := json.Marshal(cookies)
data, err := json.Marshal(cookies)
if err != nil {
return fmt.Errorf("error marshaling cookies: %v", err)
}
err = os.WriteFile(filePath, js, 0644)
if err != nil {
return fmt.Errorf("error saving cookies to file: %v", err)
}

// Load the saved cookies back into the scraper
if err := LoadCookies(scraper, filePath); err != nil {
return fmt.Errorf("error loading saved cookies: %v", err)
if err = os.WriteFile(cookieFile, data, 0644); err != nil {
return fmt.Errorf("error saving cookies: %v", err)
}

return nil
}

func LoadCookies(scraper *twitterscraper.Scraper, filePath string) error {
js, err := os.ReadFile(filePath)
func LoadCookies(scraper *twitterscraper.Scraper, account *TwitterAccount, baseDir string) error {
cookieFile := filepath.Join(baseDir, fmt.Sprintf("%s_twitter_cookies.json", account.Username))
data, err := os.ReadFile(cookieFile)
if err != nil {
return fmt.Errorf("error reading cookies from file: %v", err)
return fmt.Errorf("error reading cookies: %v", err)
}
var cookies []*http.Cookie
err = json.Unmarshal(js, &cookies)
if err != nil {
if err = json.Unmarshal(data, &cookies); err != nil {
return fmt.Errorf("error unmarshaling cookies: %v", err)
}
scraper.SetCookies(cookies)
40 changes: 15 additions & 25 deletions pkg/scrapers/twitter/followers.go
@@ -1,38 +1,28 @@
package twitter

import (
"encoding/json"
"fmt"

_ "github.com/lib/pq"
twitterscraper "github.com/masa-finance/masa-twitter-scraper"
"github.com/sirupsen/logrus"
)

// ScrapeFollowersForProfile scrapes the profile and tweets of a specific Twitter user.
// It takes the username as a parameter and returns the scraped profile information and an error if any.
func ScrapeFollowersForProfile(username string, count int) ([]twitterscraper.Legacy, error) {
scraper := auth()
return Retry(func() ([]twitterscraper.Legacy, error) {
scraper, account, err := getAuthenticatedScraper()
if err != nil {
return nil, err
}

if scraper == nil {
return nil, fmt.Errorf("there was an error authenticating with your Twitter credentials")
}
followingResponse, errString, _ := scraper.FetchFollowers(username, count, "")
if errString != "" {
if handleRateLimit(fmt.Errorf(errString), account) {
return nil, fmt.Errorf("rate limited")
}
logrus.Errorf("Error fetching followers: %v", errString)
return nil, fmt.Errorf("%v", errString)
}

followingResponse, errString, _ := scraper.FetchFollowers(username, count, "")
if errString != "" {
logrus.Printf("Error fetching profile: %v", errString)
return nil, fmt.Errorf("%v", errString)
}

// Marshal the followingResponse into a JSON string for logging
responseJSON, err := json.Marshal(followingResponse)
if err != nil {
// Log the error if the marshaling fails
logrus.Errorf("[-] Error marshaling followingResponse: %v", err)
} else {
// Log the JSON string of followingResponse
logrus.Debugf("Following response: %s", responseJSON)
}

return followingResponse, nil
return followingResponse, nil
}, MaxRetries)
}
23 changes: 23 additions & 0 deletions pkg/scrapers/twitter/profile.go
@@ -0,0 +1,23 @@
package twitter

import (
twitterscraper "github.com/masa-finance/masa-twitter-scraper"
)

func ScrapeTweetsProfile(username string) (twitterscraper.Profile, error) {
return Retry(func() (twitterscraper.Profile, error) {
scraper, account, err := getAuthenticatedScraper()
if err != nil {
return twitterscraper.Profile{}, err
}

profile, err := scraper.GetProfile(username)
if err != nil {
if handleRateLimit(err, account) {
return twitterscraper.Profile{}, err
}
return twitterscraper.Profile{}, err
}
return profile, nil
}, MaxRetries)
}
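
Both exported entry points now obtain an authenticated scraper per attempt and go through the shared Retry helper, so rate-limited accounts rotate out transparently. A hypothetical external caller sketch (the import path is inferred from the module and file layout in this PR; the username is a placeholder):

package main

import (
    "fmt"

    "github.com/masa-finance/masa-oracle/pkg/scrapers/twitter"
)

func main() {
    profile, err := twitter.ScrapeTweetsProfile("example_user")
    if err != nil {
        fmt.Println("profile scrape failed:", err)
        return
    }
    fmt.Println(profile.Username)

    followers, err := twitter.ScrapeFollowersForProfile("example_user", 20)
    if err != nil {
        fmt.Println("followers scrape failed:", err)
        return
    }
    fmt.Println(len(followers), "follower records fetched")
}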