WIP: Add prototype TwitterBatchScraper #202

Open · wants to merge 2 commits into master
1 change: 1 addition & 0 deletions Makefile
@@ -42,6 +42,7 @@ $(BUILD_STAMP): \
lib/scrapers/hackernews.js \
lib/scrapers/reddit.js \
lib/scrapers/twitter.js \
lib/scrapers/twitter_batch.js \
lib/scrapers/generic_social.js \
lib/team.js \
lib/team_hidden.js \
2 changes: 1 addition & 1 deletion lib/main.js

(Generated file; diff not rendered by default.)

229 changes: 229 additions & 0 deletions lib/scrapers/twitter_batch.js

(Generated file; diff not rendered by default.)

1 change: 1 addition & 0 deletions src/main.iced
@@ -22,6 +22,7 @@ mods = [
require('./pgp_update')
require('./announcement')
require('./scrapers/twitter')
require('./scrapers/twitter_batch')
require('./scrapers/facebook')
require('./scrapers/base')
require('./scrapers/github')
102 changes: 102 additions & 0 deletions src/scrapers/twitter_batch.iced
@@ -0,0 +1,102 @@
{sncmp,BaseScraper,BaseBearerToken} = require './base'
{TwitterScraper} = require './twitter'
{make_ids} = require '../base'
{constants} = require '../constants'
{v_codes} = constants
{decode_sig} = require('kbpgp').ukm
{Lock} = require '../util'
urlmod = require 'url'
schema = require '../schema3'

#================================================================================

ws_normalize = (x) ->
  v = x.split(/[\t\r\n ]+/)
  v.shift() if v.length and v[0].length is 0
  v.pop() if v.length and v[-1...][0].length is 0
  v.join ' '

#================================================================================

exports.TwitterBatchScraper = class TwitterBatchScraper extends TwitterScraper
  constructor: (opts) ->
    @_tweet_cache = opts.tweet_cache
    @cache_refresh_interval = opts.cache_refresh_interval
    super opts

  _hunt_batch : (cb) ->
    query =
      query : "\"Verifying myself\" \"Keybase.io\""
      expansions: "author_screen_name"
      "user.fields": "url,username"
      "tweet.fields": "created_at"
      max_results: 60
Member Author:
Might be problematic during the initial run if there were more than 60 new proofs in the last week.
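If that turns out to be a real risk, one option is to keep requesting pages until the token runs out. A rough sketch only, as a method on TwitterBatchScraper, assuming the next_token field that Twitter documents in the v2 recent-search meta object; _hunt_batch_paged and _ingest_tweet are illustrative names, with _ingest_tweet standing in for the ingest loop further down in this file:

# Sketch only: page through the recent-search results with meta.next_token so
# an initial run with more than max_results new proofs is still fully ingested.
_hunt_batch_paged : (cb) ->
  err = null
  next_token = null
  loop
    q =
      query : "\"Verifying myself\" \"Keybase.io\""
      "tweet.fields" : "created_at"
      max_results : 60
    q.since_id = @_tweet_cache.last_id if @_tweet_cache.last_id
    q.next_token = next_token if next_token?
    u = urlmod.format {
      host : "api.twitter.com"
      protocol : "https:"
      pathname : "/2/tweets/search/recent"
      query : q
    }
    await @_get_body_api { url : u }, defer err, rc, json
    break if err? or rc isnt v_codes.OK or not json?.data?
    @_ingest_tweet tweet for tweet in json.data # hypothetical helper; same work as the ingest loop below
    next_token = json.meta?.next_token # absent on the last page
    break unless next_token?
  cb err, rc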

    if since_id = @_tweet_cache.last_id
      # Do not fetch tweets that were already cached.
      query.since_id = since_id

    u = urlmod.format {
      host : "api.twitter.com"
      protocol : "https:"
      pathname : "/2/tweets/search/recent"
      query
    }

    await @_get_body_api { url : u }, defer err, rc, json
    @log "| search index #{u} -> #{rc}"
    if rc isnt v_codes.OK then #noop
    else if not json? or (json.length is 0) then rc = v_codes.EMPTY_JSON
    else if not json.data? then rc = v_codes.INVALID_JSON
Member Author:
Not invalid JSON; the data field is simply absent when there are no results.
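If so, a small tweak could replace the INVALID_JSON line above and classify the no-results case explicitly. Just a sketch, assuming the meta.result_count field documented for the v2 response:

# sketch: a missing data array with result_count 0 means "no results", not bad JSON
else if not json.data?
  rc = if json.meta?.result_count is 0 then v_codes.EMPTY_JSON else v_codes.INVALID_JSON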

    else
      console.log json.data
      for tweet, i in json.data
        {id, username, text} = tweet
        created_at = new Date(tweet.created_at)
        unless isFinite(created_at)
          @log "got invalid date in tweet JSON id: #{id}, created_at: #{tweet.created_at}"
          continue
        @log "ingesting tweet: id: #{id}, username: #{username}, text: \"#{text}\""
        @_tweet_cache.inform { id, created_at, username, text }

    cb err, rc

  hunt2 : ({username, name, proof_text_check}, cb) ->
    # See if we should refresh cache.
    await @_tweet_cache.lock.acquire defer()
    err = null
    now = Math.floor(Date.now() / 1000)
    if now - @_tweet_cache.fetched_at > @cache_refresh_interval
      @_tweet_cache.fetched_at = now
      await @_hunt_batch defer err, rc
      if not err and rc isnt v_codes.OK
        err = new Error("rc: #{rc}")
    @_tweet_cache.lock.release()
    if err
      @logl "error", "error when hunting batch: #{err.toString()}"
      return cb err

    out = {}
    rc = v_codes.NOT_FOUND
    current_tweet = @_tweet_cache.tweets.get(username)
    if current_tweet and (@find_sig_in_tweet { inside : current_tweet.text, proof_text_check }) is v_codes.OK
      rc = v_codes.OK
      remote_id = current_tweet.id
      api_url = human_url = @_id_to_url username, remote_id
      out = { remote_id, api_url, human_url }
    out.rc = rc
    cb err, out

#================================================================================

exports.TweetCache = class TweetCache
  constructor : () ->
    @tweets = new Map() # username -> tweet
Member Author:
Bad idea; the cache should keep multiple tweets per Twitter user and pick the correct one for a given hunt (see the sketch after the class below).

    @last_id = null
    @fetched_at = 0
    @lock = new Lock()

  inform : ({id, created_at, username, text}) ->
    current = @tweets.get(username)
    if current and current.created_at >= created_at
      # We already have this tweet or a more recent one for this user.
      return
    @tweets.set(username, { id, created_at, text })
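A rough sketch of that shape, reusing the file's Lock and v_codes imports; MultiTweetCache, find_matching, and check are illustrative names, not part of this PR:

# Sketch only: username -> list of tweets, with the hunt-side lookup picking
# the tweet that actually contains the proof text.
exports.MultiTweetCache = class MultiTweetCache
  constructor : () ->
    @tweets = new Map() # username -> [tweet, ...]
    @last_id = null
    @fetched_at = 0
    @lock = new Lock()

  inform : ({id, created_at, username, text}) ->
    lst = @tweets.get(username) ? []
    return if lst.some((t) -> t.id is id) # already cached
    lst.push { id, created_at, text }
    @tweets.set(username, lst)

  # check would be the scraper's find_sig_in_tweet bound with the hunt's proof_text_check
  find_matching : (username, check) ->
    for tweet in (@tweets.get(username) ? [])
      return tweet if (check tweet.text) is v_codes.OK
    null

hunt2 would then swap the single current_tweet lookup for one find_matching call.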