Build a test dataset using Google/Bing image search #11

Open
wanghaisheng opened this issue Jun 14, 2016 · 5 comments

@wanghaisheng
Contributor

No description provided.

@wanghaisheng
Contributor Author

# DESCRIPTION
#    This is a Google Image batch download tool: it takes a search query as input
#    and produces a folder containing the Google Image search results (usually a couple of images).
# SYNOPSIS
#    ./GoogleImageSearch.sh QUERY
# EXAMPLE
#    ./GoogleImageSearch.sh 'Linkin Park'

echo "searching Google Image for $1 ...";
#replace space with '+', e.g. "Linkin Park" -> "Linkin+Park"
query=$(echo "$1" | sed 's/ /+/g');
#echo $query
url="http://www.google.com.hk/search?q=$query&tbm=isch&sout=1&tbs=isz:ex,iszw:600,iszh:600";
echo $url;

#Step1: use w3m to download the webpage source
w3m -dump_source "$url" > GoogleImageSearch.html;

#Step2: fetch imgurl from webpage source file
  #insert newline in front of where string "imgurl" appears
  awk '{gsub(/imgurl/,"\nimgurl");print}' < GoogleImageSearch.html > newline_imgurl;
  #insert newline at the end of where string "jpg" or "png" appears
  awk '{gsub(/jpg/,"jpg\n");print}' < newline_imgurl > newline_jpg;
  awk '{gsub(/png/,"png\n");print}' < newline_jpg > newline_png;
  #grep imgurls
  grep -E "(imgurl=http:[-/.[:alnum:]]*jpg|imgurl=http:[-/.[:alnum:]]*png)" newline_png > remove_imgurl;
  #remove string "imgurl=", left pure url list
  awk '{gsub(/imgurl=/,"");print}' < remove_imgurl > urlList;
  #clear up
  rm newline_imgurl newline_jpg newline_png remove_imgurl;
  # to examine url list: remove '#' below
  #vi urlList;

#Step3: download image files(Input:urlList;Retry:2 times;Output:Fold $query)
wget -i urlList -t 2 -P "$query";
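
A hedged follow-up idea, not part of the script above: wget occasionally saves truncated files or HTML error pages instead of images, so a small validation pass over the $query folder can help. The sketch below assumes Pillow is installed (pip install Pillow) and that the folder name is passed as the first argument; the file name verify_downloads.py is hypothetical.

#!/usr/bin/env python
# verify_downloads.py - optional cleanup pass over a downloaded image folder.
# Drops anything that cannot be opened as an image (e.g. saved error pages).
import os
import sys
from PIL import Image

folder = sys.argv[1]  # e.g. the $query folder created by GoogleImageSearch.sh
for name in os.listdir(folder):
    path = os.path.join(folder, name)
    if not os.path.isfile(path):
        continue
    try:
        with Image.open(path) as img:
            img.verify()  # raises if the file is not a readable image
    except Exception:
        print('removing non-image file: ' + path)
        os.remove(path)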

@wanghaisheng
Contributor Author

#!/usr/bin/env python


'''
Query Bing image search (the function is still named after Google) and download
the resulting images by scraping.

To use this script, install the mechanize and BeautifulSoup packages:
easy_install mechanize
easy_install BeautifulSoup

Example Run:
installQueriedGoogleImages('spotty')

Eren Golge [email protected] - www.erengolge.com - 17 April 2013
'''

import json
import pdb
import urllib
import mechanize 
import cookielib
import re
import sys
import os
from BeautifulSoup import BeautifulSoup

def installQueriedGoogleImages(query):
    br = mechanize.Browser()
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)

    # Follows refresh 0 but does not hang on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # Want debugging messages?
    #br.set_debug_http(True)
    #br.set_debug_redirects(True)
    #br.set_debug_responses(True)

    # User-Agent (this is cheating, ok?)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

    main_url = 'http://www.bing.com/images/search?q='+query
    r = br.open(main_url)
    counter = 1
    html = r.read()
    soup = BeautifulSoup(html)
    # each result thumbnail is a div of class "dg_u"; its first anchor's "m"
    # attribute holds metadata that includes the full-size image url
    divs = soup.findAll('div', {'class': "dg_u"})
    for div in divs:
        link = div.findAll('a')[0]
        img_link = re.search('imgurl:"([^"]+)', link['m']).group(1)
        print 'Downloading image %d - %s ...' % (counter, img_link)
        try:
            ext = img_link[-4:]
            download_photo(img_link, query + '/image' + str(counter) + ext)
        except IOError:
            print 'image %d cannot be downloaded because of a server error!' % counter
        except UnicodeError:
            print 'image %d cannot be downloaded because of the naming on the website!' % counter
        counter += 1

def download_photo(img_url, filename):
    try:
        image_on_web = urllib.urlopen(img_url)
        if image_on_web.headers.maintype == 'image':
            buf = image_on_web.read()
            downloaded_image = file(filename, "wb")
            downloaded_image.write(buf)
            downloaded_image.close()
            image_on_web.close()
        else:
            return False    
    except:
        return False
    return True

if __name__ == '__main__':
    # each command line argument is treated as a separate search query
    for query in sys.argv[1:]:
        if not os.path.exists(query):
            os.makedirs(query)
        installQueriedGoogleImages(query)
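
The script above targets Python 2 (mechanize, cookielib, urllib and BeautifulSoup 3 have no Python 3 releases). Below is a rough Python 3 sketch of the same scraping approach using requests and beautifulsoup4; the dg_u class and the m attribute are taken from the 2013 script and may no longer match Bing's current markup, so treat it as an outline rather than a drop-in replacement.

#!/usr/bin/env python3
# python3_bing_scrape.py (hypothetical name) - rough Python 3 port of the logic above.
# Requires: pip install requests beautifulsoup4
import os
import re
import sys
import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0'}

def fetch_image_links(query):
    # parse the first Bing results page and collect the full-size image urls
    html = requests.get('http://www.bing.com/images/search',
                        params={'q': query}, headers=HEADERS).text
    soup = BeautifulSoup(html, 'html.parser')
    links = []
    for div in soup.find_all('div', {'class': 'dg_u'}):
        a = div.find('a')
        if a is None or not a.get('m'):
            continue
        match = re.search(r'imgurl:"([^"]+)', a['m'])
        if match:
            links.append(match.group(1))
    return links

def download_images(query, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    for i, link in enumerate(fetch_image_links(query), start=1):
        try:
            resp = requests.get(link, timeout=10)
            if resp.headers.get('Content-Type', '').startswith('image'):
                ext = os.path.splitext(link)[1] or '.jpg'
                with open(os.path.join(out_dir, 'image%d%s' % (i, ext)), 'wb') as f:
                    f.write(resp.content)
        except requests.RequestException:
            print('image %d could not be downloaded' % i)

if __name__ == '__main__':
    for q in sys.argv[1:]:
        download_images(q, q)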

@wanghaisheng
Contributor Author

# coding: utf-8

import sys
import os
import urllib
import urllib2
import json
import requests

KEY = '<Your Bing Developer Key>'
OUTPUT = '/images/bing/'
MAX = 100
count = 1

def bing_search(query, directory, skip):
        global count

        bing_url = 'https://api.datamarket.azure.com/Bing/Search/v1/Image'
        print 'search count: ' + str(count) + ', url: ' + bing_url + ', skip: ' + str(skip)
        pm = urllib2.HTTPPasswordMgrWithDefaultRealm()
        pm.add_password(None, bing_url, KEY, KEY)

        handler = urllib2.HTTPBasicAuthHandler(pm)
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
        if skip > 0:
            params = urllib.urlencode({'Query': "'" + query + "'", 'Adult': "'Off'",  '$skip': skip ,'$format': 'json'})
        else:       
            params = urllib.urlencode({'Query': "'" + query + "'", 'Adult': "'Off'", '$format': 'json'})
        response = urllib2.urlopen(bing_url+'?'+params)
        data = json.loads(response.read())

        results = data['d']['results']
        # stop recursing once the API returns no more results
        if not results:
                return

        for item in results:
                if count > MAX:
                        print 'finish. count: ' + str(MAX)
                        return

                image_url = item['MediaUrl']
                root,ext = os.path.splitext(image_url)
                if ext.lower() == '.jpg':
                        print image_url,
                        fname = OUTPUT + directory + "/bing%04d.jpg" % count
                        try:
                                r = requests.get(image_url)
                                f = open(fname, 'wb')
                                f.write(r.content)
                                f.close()
                                print "...save", fname
                        except Exception:
                                print "error", fname
                        count += 1

        bing_search(query, directory, count)

if __name__ == '__main__':
        argvs = sys.argv
        argc = len(argvs)
        if(argc != 3):
                print 'Usage: python %s query directory' % argvs[0]
                quit()
        query = argvs[1]
        directory = argvs[2]
        # make sure the output directory exists before downloading
        if not os.path.exists(OUTPUT + directory):
                os.makedirs(OUTPUT + directory)
        print 'get bing image: %s ' % query
        bing_search(query, directory, 0)
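
The urllib2 password manager above sends the Datamarket account key as both the HTTP Basic auth username and password. For reference, a minimal sketch of the same request using requests (already imported by the script); the function name bing_image_search is illustrative:

import requests

def bing_image_search(query, key, skip=0):
    # same Datamarket image query as above, with the key used for both auth fields
    url = 'https://api.datamarket.azure.com/Bing/Search/v1/Image'
    params = {'Query': "'%s'" % query, 'Adult': "'Off'", '$format': 'json'}
    if skip > 0:
        params['$skip'] = skip
    resp = requests.get(url, params=params, auth=(key, key))
    resp.raise_for_status()
    return resp.json()['d']['results']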

@wanghaisheng
Contributor Author

Bulk Bing Image Downloader

Bulk Bing Image Downloader (BBID) is a downloader which:

  • downloads full-size images from Bing image search results
  • is multithreaded
  • is cross-platform
  • bypasses the Bing API
  • has an option to disable adult content filtering
  • is written in Python 3
  • uses an SSL connection

Usage

chmod +x bbid.py
./bbid.py [-h] [-s SEARCH_STRING] [-f SEARCH_FILE] [-o OUTPUT] [--filter] [--no-filter]

Example

./bbid.py -s earth
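
To build a multi-class dataset in one run, the -f option (handled by the argument parsing below) reads one search string per line and saves each keyword into its own subdirectory of the output directory. keywords.txt and dataset are hypothetical names:

./bbid.py -f keywords.txt -o dataset --no-filter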

#!/usr/bin/env python3
import os, sys, urllib.request, re, threading, posixpath, urllib.parse, argparse, atexit, random, socket, time, hashlib, pickle, signal, subprocess

#config
output_dir = './bing' #default output dir
adult_filter = True #Do not disable adult filter by default
pool_sema = threading.BoundedSemaphore(value = 20) #max number of download threads
bingcount = 35 #default bing paging
socket.setdefaulttimeout(2)

in_progress = []
tried_urls = []
finished_keywords=[]
failed_urls = []
image_md5s = {}
urlopenheader={ 'User-Agent' : 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0'}
def download(url,output_dir,retry=False):
    global tried_urls, failed_urls
    url_hash=hashlib.sha224(url.encode('utf-8')).digest()
    if url_hash in tried_urls:
        return
    pool_sema.acquire() 
    path = urllib.parse.urlsplit(url).path
    filename = posixpath.basename(path)
    if len(filename)>40:
        filename=filename[:36]+filename[-4:]
    while os.path.exists(output_dir + '/' + filename):
        filename = str(random.randint(0,100)) + filename
    in_progress.append(filename)
    try:
        request=urllib.request.Request(url,None,urlopenheader)
        image=urllib.request.urlopen(request).read()
        if len(image)==0:
            print('no image')

        md5 = hashlib.md5()
        md5.update(image)
        md5_key = md5.hexdigest()
        if md5_key in image_md5s:
            print('FAIL Image is a duplicate of ' + image_md5s[md5_key] + ', not saving ' + filename)
            # release the download slot before the early return so the semaphore is not leaked
            pool_sema.release()
            return

        image_md5s[md5_key] = filename

        imagefile=open(output_dir + '/' + filename,'wb')
        imagefile.write(image)
        imagefile.close()
        in_progress.remove(filename)
        if retry:
            print('Retry OK '+ filename)
        else:
            print("OK " + filename)
        tried_urls.append(url_hash)
    except Exception as e:
        if retry:
            print('Retry Fail ' + filename)
        else:
            print("FAIL " + filename)
            failed_urls.append((url, output_dir))
    pool_sema.release()

def removeNotFinished():
    for filename in in_progress:
        try:
            os.remove(output_dir + '/' + filename)
        except FileNotFoundError:
            pass

def fetch_images_from_keyword(keyword,output_dir):
    current = 1
    last = ''
    while True:
        request_url='https://www.bing.com/images/async?q=' + urllib.parse.quote_plus(keyword) + '&async=content&first=' + str(current) + '&adlt=' + adlt
        request=urllib.request.Request(request_url,None,headers=urlopenheader)
        response=urllib.request.urlopen(request)
        html = response.read().decode('utf8')
        links = re.findall('imgurl:&quot;(.*?)&quot;',html)
        try:
            if links[-1] == last:
                break
            last = links[-1]
            current += bingcount
            for link in links:
                t = threading.Thread(target = download,args = (link,output_dir))
                t.start()
        except IndexError:
            print('No search results for "{0}"'.format(keyword))
            return False
        time.sleep(0.1)
    return True

def backup_history(*args):
    download_history=open(output_dir + '/download_history.pickle','wb')
    pickle.dump(tried_urls,download_history)
    pickle.dump(finished_keywords, download_history)
    pickle.dump(image_md5s, download_history)
    download_history.close()
    print('history_dumped')
    if args:
        exit(0)

if __name__ == "__main__":
    atexit.register(removeNotFinished)
    parser = argparse.ArgumentParser(description = 'Bing image bulk downloader')
    parser.add_argument('-s', '--search-string', help = 'Keyword to search', required = False)
    parser.add_argument('-f', '--search-file', help = 'Path to a file containing search strings line by line', required = False)
    parser.add_argument('-o', '--output', help = 'Output directory', required = False)
    parser.add_argument('--filter', help = 'Enable adult filter', action = 'store_true', required = False)
    parser.add_argument('--no-filter', help=  'Disable adult filter', action = 'store_true', required = False)
    args = parser.parse_args()
    if (not args.search_string) and (not args.search_file):
        parser.error('Provide Either search string or path to file containing search strings')
    if args.output:
        output_dir = args.output
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    output_dir_origin = output_dir
    signal.signal(signal.SIGINT, backup_history)
    try:
        download_history=open(output_dir + '/download_history.pickle','rb')
        tried_urls=pickle.load(download_history)
        finished_keywords=pickle.load(download_history)
        image_md5s=pickle.load(download_history)
        download_history.close()
    except (OSError, IOError):
        tried_urls=[]
    if adult_filter:
        adlt = ''
    else:
        adlt = 'off'
    if args.no_filter:
        adlt = 'off'
    elif args.filter:
        adlt = ''
    if args.search_string:
        keyword = args.search_string
        fetch_images_from_keyword(args.search_string,output_dir)
    elif args.search_file:
        try:
            inputFile=open(args.search_file)
        except (OSError, IOError):
            print("Couldn't open file {}".format(args.search_file))
            exit(1)
        for keyword in inputFile.readlines():
            keyword_hash=hashlib.sha224(keyword.strip().encode('utf-8')).digest()
            if keyword_hash in finished_keywords:
                print('"{0}" Already downloaded'.format(keyword.strip()))
                continue
            output_dir = output_dir_origin + '/' + keyword.strip().replace(' ','_')
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            if fetch_images_from_keyword(keyword,output_dir):
                finished_keywords.append(keyword_hash)
                for failed_url in failed_urls:
                    t = threading.Thread(target = download,args = (failed_url[0],failed_url[1],True))
                    t.start()
                failed_urls=[]
            backup_history()
        inputFile.close()
