Build a test dataset using Google/Bing image search #11

Open
wanghaisheng opened this issue Jun 14, 2016 · 5 comments

@wanghaisheng
Contributor

No description provided.

@wanghaisheng
Contributor Author

# DESCRIPTION
#    This is a Google Image batch download tool: it takes a search query as input
#    and produces a folder containing the Google Image search results (usually a couple of images).
# SYNOPSIS
#    ./GoogleImageSearch.sh QUERY
# EXAMPLE
#    ./GoogleImageSearch.sh 'Linkin Park'

echo "searching Google Image for $1 ...";
#replace space with '+', e.g. "Linkin Park" -> "Linkin+Park"
query=$(echo "$1" | sed 's/ /+/g');
#echo $query
url="http://www.google.com.hk/search?q=$query&tbm=isch&sout=1&tbs=isz:ex,iszw:600,iszh:600";
echo $url;

#Step1: use w3m to download the webpage source
w3m -dump_source "$url" > GoogleImageSearch.html;

#Step2: fetch imgurl from webpage source file
  #insert newline in front of where string "imgurl" appears
  awk '{gsub(/imgurl/,"\nimgurl");print}' < GoogleImageSearch.html > newline_imgurl;
  #insert newline at the end of where string "jpg" or "png" appears
  awk '{gsub(/jpg/,"jpg\n");print}' < newline_imgurl > newline_jpg;
  awk '{gsub(/png/,"png\n");print}' < newline_jpg > newline_png;
  #grep imgurls
  grep -E "(imgurl=http:[-/.[:alnum:]]*jpg|imgurl=http:[-/.[:alnum:]]*png)" newline_png > remove_imgurl;
  #remove string "imgurl=", left pure url list
  awk '{gsub(/imgurl=/,"");print}' < remove_imgurl > urlList;
  #clear up
  rm newline_imgurl newline_jpg newline_png remove_imgurl;
  # to examine url list: remove '#' below
  #vi urlList;

#Step3: download image files(Input:urlList;Retry:2 times;Output:Fold $query)
wget -i urlList -t 2 -P "$query";
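
A hedged follow-up idea, not part of the script above: wget occasionally saves truncated files or HTML error pages instead of images, so a small validation pass over the $query folder can help. The sketch below assumes Pillow is installed (pip install Pillow) and that the folder name is passed as the first argument; the file name verify_downloads.py is hypothetical.

#!/usr/bin/env python
# verify_downloads.py - optional cleanup pass over a downloaded image folder.
# Drops anything that cannot be opened as an image (e.g. saved error pages).
import os
import sys
from PIL import Image

folder = sys.argv[1]  # e.g. the $query folder created by GoogleImageSearch.sh
for name in os.listdir(folder):
    path = os.path.join(folder, name)
    if not os.path.isfile(path):
        continue
    try:
        with Image.open(path) as img:
            img.verify()  # raises if the file is not a readable image
    except Exception:
        print('removing non-image file: ' + path)
        os.remove(path)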

@wanghaisheng
Contributor Author

#!/usr/bin/env python


'''
Query Bing image search (the function is still named after Google) and download
the resulting images by scraping.

To use this script, install the mechanize and BeautifulSoup packages:
easy_install mechanize
easy_install BeautifulSoup

Example Run:
installQueriedGoogleImages('spotty')

Eren Golge [email protected] - www.erengolge.com - 17 April 2013
'''

import json
import pdb
import urllib
import mechanize 
import cookielib
import re
import sys
import os
from BeautifulSoup import BeautifulSoup

def installQueriedGoogleImages(query):
    br = mechanize.Browser()
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)

    # Follows refresh 0 but does not hang on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # Want debugging messages?
    #br.set_debug_http(True)
    #br.set_debug_redirects(True)
    #br.set_debug_responses(True)

    # User-Agent (this is cheating, ok?)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

    main_url = 'http://www.bing.com/images/search?q='+query
    r = br.open(main_url)
    counter = 1
    html = r.read()
    soup = BeautifulSoup(html)
    # each result thumbnail is a div of class "dg_u"; its first anchor's "m"
    # attribute holds metadata that includes the full-size image url
    divs = soup.findAll('div', {'class': "dg_u"})
    for div in divs:
        link = div.findAll('a')[0]
        img_link = re.search('imgurl:"([^"]+)', link['m']).group(1)
        print 'Downloading image %d - %s ...' % (counter, img_link)
        try:
            ext = img_link[-4:]
            download_photo(img_link, query + '/image' + str(counter) + ext)
        except IOError:
            print 'image %d cannot be downloaded because of a server error!' % counter
        except UnicodeError:
            print 'image %d cannot be downloaded because of the naming on the website!' % counter
        counter += 1

def download_photo(img_url, filename):
    try:
        image_on_web = urllib.urlopen(img_url)
        if image_on_web.headers.maintype == 'image':
            buf = image_on_web.read()
            downloaded_image = file(filename, "wb")
            downloaded_image.write(buf)
            downloaded_image.close()
            image_on_web.close()
        else:
            return False    
    except:
        return False
    return True

if __name__ == '__main__':
    # each command line argument is treated as a separate search query
    for query in sys.argv[1:]:
        if not os.path.exists(query):
            os.makedirs(query)
        installQueriedGoogleImages(query)
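
The script above targets Python 2 (mechanize, cookielib, urllib and BeautifulSoup 3 have no Python 3 releases). Below is a rough Python 3 sketch of the same scraping approach using requests and beautifulsoup4; the dg_u class and the m attribute are taken from the 2013 script and may no longer match Bing's current markup, so treat it as an outline rather than a drop-in replacement.

#!/usr/bin/env python3
# python3_bing_scrape.py (hypothetical name) - rough Python 3 port of the logic above.
# Requires: pip install requests beautifulsoup4
import os
import re
import sys
import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0'}

def fetch_image_links(query):
    # parse the first Bing results page and collect the full-size image urls
    html = requests.get('http://www.bing.com/images/search',
                        params={'q': query}, headers=HEADERS).text
    soup = BeautifulSoup(html, 'html.parser')
    links = []
    for div in soup.find_all('div', {'class': 'dg_u'}):
        a = div.find('a')
        if a is None or not a.get('m'):
            continue
        match = re.search(r'imgurl:"([^"]+)', a['m'])
        if match:
            links.append(match.group(1))
    return links

def download_images(query, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    for i, link in enumerate(fetch_image_links(query), start=1):
        try:
            resp = requests.get(link, timeout=10)
            if resp.headers.get('Content-Type', '').startswith('image'):
                ext = os.path.splitext(link)[1] or '.jpg'
                with open(os.path.join(out_dir, 'image%d%s' % (i, ext)), 'wb') as f:
                    f.write(resp.content)
        except requests.RequestException:
            print('image %d could not be downloaded' % i)

if __name__ == '__main__':
    for q in sys.argv[1:]:
        download_images(q, q)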

@wanghaisheng
Contributor Author

# coding: utf-8

import sys
import os
import urllib
import urllib2
import json
import requests

KEY = '<Your Bing Developer Key>'
OUTPUT = '/images/bing/'
MAX = 100
count = 1

def bing_search(query, directory, skip):
        global count

        bing_url = 'https://api.datamarket.azure.com/Bing/Search/v1/Image'
        print 'search count: ' + str(count) + ', url: ' + bing_url + ', skip: ' + str(skip)
        pm = urllib2.HTTPPasswordMgrWithDefaultRealm()
        pm.add_password(None, bing_url, KEY, KEY)

        handler = urllib2.HTTPBasicAuthHandler(pm)
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
        if skip > 0:
            params = urllib.urlencode({'Query': "'" + query + "'", 'Adult': "'Off'",  '$skip': skip ,'$format': 'json'})
        else:       
            params = urllib.urlencode({'Query': "'" + query + "'", 'Adult': "'Off'", '$format': 'json'})
        response = urllib2.urlopen(bing_url+'?'+params)
        data = json.loads(response.read())

        results = data['d']['results']
        # stop recursing once the API returns no more results
        if not results:
                return

        for item in results:
                if count > MAX:
                        print 'finish. count: ' + str(MAX)
                        return

                image_url = item['MediaUrl']
                root,ext = os.path.splitext(image_url)
                if ext.lower() == '.jpg':
                        print image_url,
                        fname = OUTPUT + directory + "/bing%04d.jpg" % count
                        try:
                                r = requests.get(image_url)
                                f = open(fname, 'wb')
                                f.write(r.content)
                                f.close()
                                print "...save", fname
                        except Exception:
                                print "error", fname
                        count += 1

        bing_search(query, directory, count)

if __name__ == '__main__':
        argvs = sys.argv
        argc = len(argvs)
        if(argc != 3):
                print 'Usage: python %s query directory' % argvs[0]
                quit()
        query = argvs[1]
        directory = argvs[2]
        # make sure the output directory exists before downloading
        if not os.path.exists(OUTPUT + directory):
                os.makedirs(OUTPUT + directory)
        print 'get bing image: %s ' % query
        bing_search(query, directory, 0)
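
The urllib2 password manager above sends the Datamarket account key as both the HTTP Basic auth username and password. For reference, a minimal sketch of the same request using requests (already imported by the script); the function name bing_image_search is illustrative:

import requests

def bing_image_search(query, key, skip=0):
    # same Datamarket image query as above, with the key used for both auth fields
    url = 'https://api.datamarket.azure.com/Bing/Search/v1/Image'
    params = {'Query': "'%s'" % query, 'Adult': "'Off'", '$format': 'json'}
    if skip > 0:
        params['$skip'] = skip
    resp = requests.get(url, params=params, auth=(key, key))
    resp.raise_for_status()
    return resp.json()['d']['results']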

@wanghaisheng
Contributor Author

Bulk Bing Image Downloader

Bulk Bing Image Downloader (BBID) is a downloader which:

  • downloads full-size images from Bing image search results
  • is multithreaded
  • is cross-platform
  • bypasses the Bing API
  • has an option to disable adult content filtering
  • is written in Python 3
  • uses an SSL connection

Usage

chmod +x bbid.py
./bbid.py [-h] [-s SEARCH_STRING] [-f SEARCH_FILE] [-o OUTPUT] [--filter] [--no-filter]

Example

./bbid.py -s earth
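
To build a multi-class dataset in one run, the -f option (handled by the argument parsing below) reads one search string per line and saves each keyword into its own subdirectory of the output directory. keywords.txt and dataset are hypothetical names:

./bbid.py -f keywords.txt -o dataset --no-filter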

#!/usr/bin/env python3
import os, sys, urllib.request, re, threading, posixpath, urllib.parse, argparse, atexit, random, socket, time, hashlib, pickle, signal, subprocess

#config
output_dir = './bing' #default output dir
adult_filter = True #Do not disable adult filter by default
pool_sema = threading.BoundedSemaphore(value = 20) #max number of download threads
bingcount = 35 #default bing paging
socket.setdefaulttimeout(2)

in_progress = []
tried_urls = []
finished_keywords=[]
failed_urls = []
image_md5s = {}
urlopenheader={ 'User-Agent' : 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0'}
def download(url,output_dir,retry=False):
    global tried_urls, failed_urls
    url_hash=hashlib.sha224(url.encode('utf-8')).digest()
    if url_hash in tried_urls:
        return
    pool_sema.acquire() 
    path = urllib.parse.urlsplit(url).path
    filename = posixpath.basename(path)
    if len(filename)>40:
        filename=filename[:36]+filename[-4:]
    while os.path.exists(output_dir + '/' + filename):
        filename = str(random.randint(0,100)) + filename
    in_progress.append(filename)
    try:
        request=urllib.request.Request(url,None,urlopenheader)
        image=urllib.request.urlopen(request).read()
        if len(image)==0:
            print('no image')

        md5 = hashlib.md5()
        md5.update(image)
        md5_key = md5.hexdigest()
        if md5_key in image_md5s:
            print('FAIL Image is a duplicate of ' + image_md5s[md5_key] + ', not saving ' + filename)
            # release the download slot before the early return so the semaphore is not leaked
            pool_sema.release()
            return

        image_md5s[md5_key] = filename

        imagefile=open(output_dir + '/' + filename,'wb')
        imagefile.write(image)
        imagefile.close()
        in_progress.remove(filename)
        if retry:
            print('Retry OK '+ filename)
        else:
            print("OK " + filename)
        tried_urls.append(url_hash)
    except Exception as e:
        if retry:
            print('Retry Fail ' + filename)
        else:
            print("FAIL " + filename)
            failed_urls.append((url, output_dir))
    pool_sema.release()

def removeNotFinished():
    for filename in in_progress:
        try:
            os.remove(output_dir + '/' + filename)
        except FileNotFoundError:
            pass

def fetch_images_from_keyword(keyword,output_dir):
    current = 1
    last = ''
    while True:
        request_url='https://www.bing.com/images/async?q=' + urllib.parse.quote_plus(keyword) + '&async=content&first=' + str(current) + '&adlt=' + adlt
        request=urllib.request.Request(request_url,None,headers=urlopenheader)
        response=urllib.request.urlopen(request)
        html = response.read().decode('utf8')
        links = re.findall('imgurl:&quot;(.*?)&quot;',html)
        try:
            if links[-1] == last:
                break
            last = links[-1]
            current += bingcount
            for link in links:
                t = threading.Thread(target = download,args = (link,output_dir))
                t.start()
        except IndexError:
            print('No search results for "{0}"'.format(keyword))
            return False
        time.sleep(0.1)
    return True

def backup_history(*args):
    download_history=open(output_dir + '/download_history.pickle','wb')
    pickle.dump(tried_urls,download_history)
    pickle.dump(finished_keywords, download_history)
    pickle.dump(image_md5s, download_history)
    download_history.close()
    print('history_dumped')
    if args:
        exit(0)

if __name__ == "__main__":
    atexit.register(removeNotFinished)
    parser = argparse.ArgumentParser(description = 'Bing image bulk downloader')
    parser.add_argument('-s', '--search-string', help = 'Keyword to search', required = False)
    parser.add_argument('-f', '--search-file', help = 'Path to a file containing search strings line by line', required = False)
    parser.add_argument('-o', '--output', help = 'Output directory', required = False)
    parser.add_argument('--filter', help = 'Enable adult filter', action = 'store_true', required = False)
    parser.add_argument('--no-filter', help=  'Disable adult filter', action = 'store_true', required = False)
    args = parser.parse_args()
    if (not args.search_string) and (not args.search_file):
        parser.error('Provide Either search string or path to file containing search strings')
    if args.output:
        output_dir = args.output
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    output_dir_origin = output_dir
    signal.signal(signal.SIGINT, backup_history)
    try:
        download_history=open(output_dir + '/download_history.pickle','rb')
        tried_urls=pickle.load(download_history)
        finished_keywords=pickle.load(download_history)
        image_md5s=pickle.load(download_history)
        download_history.close()
    except (OSError, IOError):
        tried_urls=[]
    if adult_filter:
        adlt = ''
    else:
        adlt = 'off'
    if args.no_filter:
        adlt = 'off'
    elif args.filter:
        adlt = ''
    if args.search_string:
        keyword = args.search_string
        fetch_images_from_keyword(args.search_string,output_dir)
    elif args.search_file:
        try:
            inputFile=open(args.search_file)
        except (OSError, IOError):
            print("Couldn't open file {}".format(args.search_file))
            exit(1)
        for keyword in inputFile.readlines():
            keyword_hash=hashlib.sha224(keyword.strip().encode('utf-8')).digest()
            if keyword_hash in finished_keywords:
                print('"{0}" Already downloaded'.format(keyword.strip()))
                continue
            output_dir = output_dir_origin + '/' + keyword.strip().replace(' ','_')
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            if fetch_images_from_keyword(keyword,output_dir):
                finished_keywords.append(keyword_hash)
                for failed_url in failed_urls:
                    t = threading.Thread(target = download,args = (failed_url[0],failed_url[1],True))
                    t.start()
                failed_urls=[]
            backup_history()
        inputFile.close()
