跳至主要内容

博文

目前显示的是 2020 年的博文

如何爬取网页?可运行的 Python 脚本

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Declares the source encoding; Python works with Unicode text by default.
import json
import os
import re
import sys
import time
from urllib.request import urlopen

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options


def first():
    """Start a headless Firefox WebDriver and return it.

    The Firefox binary and geckodriver locations are hard-coded Windows
    paths. A Firefox profile with the ``permissions.default.stylesheet`` and
    ``permissions.default.image`` preferences set to 2 is attached to the
    session (presumably to skip loading CSS and images -- confirm against
    the Firefox preference documentation).

    Returns:
        The ``webdriver.Firefox`` instance that was created. Nothing in this
        function closes it, so the caller is responsible for ``quit()``.
    """
    binary = r'C:\Program Files\Mozilla Firefox32\firefox.exe'

    options = Options()
    options.set_headless(headless=True)
    options.binary = binary

    # Work on a copy: DesiredCapabilities.FIREFOX is a class-level dict, so
    # mutating it in place would leak the change into every later user.
    cap = DesiredCapabilities.FIREFOX.copy()
    cap["marionette"] = True  # optional

    fp = webdriver.FirefoxProfile()
    fp.set_preference("permissions.default.stylesheet", 2)
    fp.set_preference("permissions.default.image", 2)

    # The profile must be handed to the driver; previously it was built and
    # then dropped, so the preferences above never reached the browser.
    driver = webdriver.Firefox(
        firefox_profile=fp,
        firefox_options=options,
        capabilities=cap,
        executable_path="C:\\Program Files\\geckodriver-v0.26.0-win64\\geckodriver.exe",
    )
    return driver


# NOTE(review): the visible source is cut off in the middle of the next
# definition; the fragment is carried over verbatim below and left untouched.
# def readlinkfile(sourcename='tangshisanbai