gevent with multiple backends

Date: 2019/05/14 Categories: Work Tags: Crawler



First version

Spawn one greenlet per query against a single backend over a urllib3 HTTPConnectionPool; whichever callback comes back first sets the shared AsyncResult, and the rest of the pool is killed.

# coding: utf-8

from gevent.monkey import patch_all; patch_all()
import gevent
from gevent.pool import Pool
from gevent.event import Event, AsyncResult
from urllib3 import PoolManager
from gevent.pywsgi import WSGIServer
import time
import ujson

import pandas as pd

from urllib3 import HTTPConnectionPool
http = HTTPConnectionPool('10.229.146.230', port=1234, maxsize=10)

# POST a single query to the backend, time it, and hand the parsed JSON to cb.
def ask(query, cb):
    start = time.time()
    #r = requests.post('http://10.229.146.230:1234/api', json={'query':query})
    #result.set(r.json())
    r = http.request('POST', '/api', body=ujson.dumps({'query':query}))
    print 'finish query: {}, duration: {}ms'.format(query, 1000*(time.time() - start))
    cb(ujson.loads(r.data))

pool = Pool()
# Fire all queries concurrently and return the first response that comes back.
def query(*args):
    result = AsyncResult()
    setted = [False]
    #kill = lambda : pool.kill()

    # Only the first response wins: set the result once, then kill the slower greenlets.
    def callback(x):
        if not setted[0]:
            setted[0] = True
            result.set(x)
            pool.kill()
    for i in args:
        pool.spawn(ask, i, callback)
    return result.wait()


import json
start = time.time()
x =  query('世界最高山峰', '中国最长的河流', '徐峥是谁')
print 'duration: {}ms'.format(1000*(time.time() - start))
pool.join()
print pd.DataFrame(x[:1])
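
Both versions hinge on the same trick: every greenlet reports through a shared AsyncResult, and result.wait() unblocks as soon as the first set() happens. Below is a minimal self-contained sketch of that pattern, with gevent.sleep standing in for the HTTP round trips; fake_backend, race and the latencies are made up for illustration.

# coding: utf-8
import gevent
from gevent.pool import Pool
from gevent.event import AsyncResult

def fake_backend(name, latency, cb):
    # Stand-in for an HTTP call: sleep, then report a result.
    gevent.sleep(latency)
    cb({'backend': name, 'latency': latency})

def race(backends):
    pool = Pool()
    result = AsyncResult()
    for name, latency in backends:
        pool.spawn(fake_backend, name, latency, result.set)  # first set() wins
    winner = result.wait()
    pool.kill()  # cancel the slower greenlets from the caller side
    return winner

print(race([('slow', 0.5), ('fast', 0.1), ('medium', 0.3)]))
# -> the 'fast' backend's result

Killing the pool from the caller, after wait() returns, sidesteps the subtlety of calling pool.kill() from inside a greenlet that itself belongs to the pool.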

Second version

Same first-response-wins race, but each job carries its own URL and payload, the requests go through geventhttpclient, and the winner kills only its sibling jobs with pool.killone() instead of killing the whole pool.

# coding: utf-8

import gevent
from gevent.pool import Pool
from gevent.event import AsyncResult
import time
import json
from geventhttpclient import HTTPClient


KILL = [True]  # when True, the first finisher kills the other in-flight jobs
pool = Pool()

# POST payload to url; the first job to finish cancels the others, then reports via cb.
def ask(url, payload, cb, pool, jobs):
    start = time.time()

    http = HTTPClient.from_url(url, concurrency=1)
    r = http.post(url, body=payload)
    content = r.read()
    # First response wins: cancel the other in-flight jobs before reporting.
    if KILL[0]:
        for k, job in jobs.items():
            if k != url:
                pool.killone(job)
    cb(json.loads(content))
    print 'finish query: {}, duration: {}ms'.format(url, 1000*(time.time() - start))

def query(*args):
    result = AsyncResult()
    setted = [False]

    def callback(x):
        if not setted[0]:
            setted[0] = True
            result.set(x)

    jobs = {}
    for url, payload in args:
        payload = json.dumps(payload)
        job = pool.spawn(ask, url, payload, callback, pool, jobs)
        jobs[url] = job
    return result.wait()


start = time.time()

queries = [
    ('http://10.229.146.230:1234/api', {'query': '徐峥是谁'}),
    ('http://100.77.14.21:50005/api', {"Type":"LIST","keys":[{"name":"英达","prop_name":"儿子","type":"sp\tname","type_name":"人物类_人物"}]} ),
]

x =  query(*queries)
print 'duration: {}ms'.format(1000*(time.time() - start))
print x
pool.join()
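
The shared KILL flag and the callback plumbing can also be replaced by letting gevent pick the winner: gevent.wait(jobs, count=1) returns as soon as one greenlet finishes, and gevent.killall() cancels the rest. Below is a rough sketch of the second version along those lines, still assuming geventhttpclient as above; the post_json helper is hypothetical.

# coding: utf-8
import json
import gevent
from geventhttpclient import HTTPClient
from geventhttpclient.url import URL

def post_json(url, payload):
    # Hypothetical helper: POST a JSON payload and return the parsed response.
    target = URL(url)
    client = HTTPClient.from_url(target, concurrency=1)
    try:
        r = client.post(target.request_uri, body=json.dumps(payload))
        return json.loads(r.read())
    finally:
        client.close()

def query(*args):
    jobs = [gevent.spawn(post_json, url, payload) for url, payload in args]
    done = gevent.wait(jobs, count=1)  # block until the first job finishes
    gevent.killall([job for job in jobs if job not in done])
    return done[0].value  # None if the winning job raised instead of returning

# x = query(*queries)  # same queries list as above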