2018/04/28 的全量原始 querylog:
共 5287122 条 query(未去重) 234fa1f16c2e1af47e5f2c0fbe97844fa29f5b58
|from geventhttpclient import HTTPClient
import ujson
from itertools import islice
import gzip
import requests, io
from itertools import islice
from tqdm import tqdm_notebook as tqdm
import gevent.pool
import time
import numpy as np
import matplotlib.pyplot as plot
# Concurrency level shared by the HTTP client below and the gevent pool that
# dispatches the queries.
# NOTE(review): the name is misspelled ("CONCCURRENCY"); it is left as-is
# because other statements in this file reference it by this spelling.
CONCCURRENCY = 48
# Base URL of the service that classify() posts queries to.
url = 'http://10.229.146.230:1234'
# Client shared by every classify() call; presumably `concurrency` caps the
# number of simultaneous connections -- confirm against geventhttpclient docs.
http = HTTPClient.from_url(url, concurrency=CONCCURRENCY)
def classify(query):
    """Send *query* to the scoring service and bucket the response.

    The query is POSTed as JSON to ``/api?truncate=0.7`` through the shared
    ``http`` client. The decoded response is iterated and each element's
    ``'score'`` entry is compared against two thresholds (presumably the
    response is a list of candidate dicts -- confirm against the service).

    Returns:
        2 if any element scores >= 0.9,
        1 if none does but some element scores >= 0.7,
        0 otherwise (including an empty response).
    """
    payload = ujson.dumps({'query': query})
    response = http.post('/api?truncate=0.7', body=payload)
    candidates = ujson.load(response)

    def reaches(threshold):
        # Lazy scan: stops at the first candidate meeting the threshold.
        return any(item['score'] >= threshold for item in candidates)

    if reaches(0.9):
        return 2
    return 1 if reaches(0.7) else 0
# Per-request wall-clock durations in milliseconds, appended by cb().
latency = []


def cb(query):
    """Classify one query, recording its latency and its result bucket.

    Times the classify() call with time.time(), appends the elapsed time in
    milliseconds to the module-level ``latency`` list, and increments the
    counter in the module-level ``results`` dict keyed by classify()'s
    return value.
    """
    began = time.time()
    bucket = classify(query)
    finished = time.time()
    elapsed_ms = (finished - began) * 1000  # seconds -> ms
    latency.append(elapsed_ms)
    results[bucket] += 1
x=!wc -l /data/querylog_20190428.txt
size = int(x[0].split()[0])
print 'size:', size
pool = gevent.pool.Pool(CONCCURRENCY)
results = {2:0, 1:0, 0:0}
src = open('/data/querylog_20190428.txt')
for line in tqdm(src, total=size):
query = line.strip()
pool.wait_available()
pool.spawn(cb, query)
total = sum(results.values())
print 'hit: ', results[2]/float(total)
print 'subhit:', results[1]/float(total)
latency = np.array(sorted(latency))
np.percentile(latency, [1, 5, 25, 50, 75, 95, 99])
1.24%
11.22%
结论: 可能提升的召回的上界为 11.22/1.24
≈ 9.05,即约 905%