Selenium入门

Date: 2019/10/16 Categories: 工作 Tags: selenium 爬虫



Firefox

本示例使用 Firefox 61 + geckodriver。安装 pyvirtualdisplay 之前，需要先安装其系统依赖：`yum install xorg-x11-utils`。

# Fetch a Baidu search results page with Firefox running inside a virtual
# X display, routing HTTP and HTTPS traffic through a manually configured proxy.
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Proxy used for both HTTP and HTTPS traffic.
PROXY_HOST = "devnet-proxy.oa.com"
PROXY_PORT = 8080

# Start an invisible Xvfb display so the browser has a screen to render to.
display = Display(backend='xvfb', visible=0, size=(1920, 1080)).start()
try:
    # Copy the template: DesiredCapabilities.FIREFOX is a class-level dict
    # shared by every user of the class, so it must not be mutated in place.
    capabilities = DesiredCapabilities.FIREFOX.copy()
    capabilities["marionette"] = True

    profile = webdriver.FirefoxProfile()
    # network.proxy.type = 1 selects manual proxy configuration; it only
    # needs to be set once for both the HTTP and the SSL proxy entries.
    profile.set_preference("network.proxy.type", 1)
    profile.set_preference("network.proxy.http", PROXY_HOST)
    profile.set_preference("network.proxy.http_port", PROXY_PORT)
    profile.set_preference("network.proxy.ssl", PROXY_HOST)
    profile.set_preference("network.proxy.ssl_port", PROXY_PORT)
    profile.update_preferences()

    driver = webdriver.Firefox(capabilities=capabilities, firefox_profile=profile)
    try:
        driver.get('https://www.baidu.com/s?wd=音乐电影')
        # page_source is already text, so no encode/decode round trip is
        # needed; the UTF-8 bytes are kept under the original name `content`.
        html = driver.page_source
        content = html.encode("utf-8")
    finally:
        # Always close the browser, otherwise the firefox/geckodriver
        # processes outlive the script.
        driver.quit()
finally:
    # Always stop the virtual display, even if the browser failed to start.
    display.stop()

Chrome

# Fetch a Baidu search results page with Chrome running inside a virtual
# X display, then extract one element by XPath and pretty-print its inner HTML.
# `import lxml` alone does not load the submodules, so lxml.html and
# lxml.etree must be imported explicitly before they are used below.
import lxml.etree
import lxml.html
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Start an invisible Xvfb display so the browser has a screen to render to.
display = Display(backend='xvfb', visible=0, size=(1920, 1080)).start()
try:
    chrome_options = Options()
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--disable-gpu")
    # Route all traffic through the proxy.
    chrome_options.add_argument('--proxy-server=devnet-proxy.oa.com:8080')

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get('https://www.baidu.com/s?wd=音乐电影')
        # page_source is already text, so no encode/decode round trip is
        # needed; the UTF-8 bytes are kept under the original name `content`.
        html = driver.page_source
        content = html.encode("utf-8")
        print('exactqa' in html)

        # Use XPath to locate an HTML element and read the innerHTML of that tag.
        element = driver.find_element_by_xpath('//span[@class="opui-page-next OP_LOG_BTN"]')
        innerHTML = element.get_attribute('innerHTML')

        # Parse the HTML fragment and print it, pretty-printed.
        node = lxml.html.fromstring(innerHTML)
        print(lxml.etree.tostring(node, pretty_print=True, encoding='unicode'))
    finally:
        # Always close the browser, otherwise the chrome/chromedriver
        # processes outlive the script.
        driver.quit()
finally:
    # Always stop the virtual display, even if the browser failed to start.
    display.stop()

Chrome 在 Linux 上运行需要一些系统依赖库，具体列表可参考：https://github.com/GoogleChrome/puppeteer/blob/master/docs/troubleshooting.md#chrome-headless-doesnt-launch-on-unix