如何爬取网页？可运行的 Python 脚本

#!/usr/bin/python
# -*- coding: utf-8 -*-  # Declare the source file encoding (Python 3 source files already default to UTF-8)

import json,os,sys
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import time

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

def first():
    """Start a headless Firefox session and return its WebDriver.

    The Firefox binary and geckodriver locations are hard-coded Windows
    paths. A profile that sets the stylesheet and image permissions to 2
    is attached to the session (presumably "block" -- verify against the
    Firefox preference documentation).

    Returns:
        The started ``webdriver.Firefox`` instance. The caller is
        responsible for calling ``quit()`` on it.
    """
    binary = r'C:\Program Files\Mozilla Firefox32\firefox.exe'

    fp = webdriver.FirefoxProfile()
    fp.set_preference("permissions.default.stylesheet", 2)
    fp.set_preference("permissions.default.image", 2)

    options = Options()
    options.set_headless(headless=True)
    options.binary = binary
    # Attach the profile; without this the preferences set above are
    # never applied to the browser session.
    options.profile = fp

    # Copy so the shared class-level capability dict is not mutated.
    cap = DesiredCapabilities.FIREFOX.copy()
    cap["marionette"] = True  # optional

    driver = webdriver.Firefox(
        firefox_options=options,
        capabilities=cap,
        executable_path="C:\\Program Files\\geckodriver-v0.26.0-win64\\geckodriver.exe",
    )
    return driver

def readlinkfile(sourcename='tangshisanbaishou.txt'):
    """Read a UTF-8 text file that holds one link per line.

    Args:
        sourcename: Path of the file to read.

    Returns:
        The file's lines in order, without their newline characters.
        Empty and whitespace-only lines are dropped, so a trailing
        newline in the file does not produce an empty link.
    """
    with open(sourcename, 'rt', encoding='utf-8') as f:
        data = f.read()

    # Split on '\n' only (text mode already normalises line endings).
    return [line for line in data.split('\n') if line.strip()]

def writeTxtFile(data,outfilename):
    """Write each string in ``data`` on its own line to ``outfilename + '.txt'``.

    The file is opened in write mode, so existing content is replaced.
    The parent folder is created when it does not exist yet.

    Args:
        data: Iterable of strings; a newline is appended to each one.
        outfilename: Output path without the ``.txt`` extension.
    """
    target = outfilename + '.txt'
    folder = os.path.dirname(target)
    if folder:
        os.makedirs(folder, exist_ok=True)

    with open(target, 'wt', encoding='utf-8') as f:
        f.writelines(m + '\n' for m in data)

def writeJsonFile(data,outfilename):
    """Serialise each item of ``data`` as JSON into ``outfilename + '.json'``.

    The file is opened in write mode, so existing content is replaced.
    Every item is written as its own indented JSON document followed by
    a newline, so the result is a sequence of JSON documents rather than
    one single document. The newline keeps consecutive documents from
    running into each other (for example two numbers merging into one).
    The parent folder is created when it does not exist yet.

    Args:
        data: Iterable of JSON-serialisable objects.
        outfilename: Output path without the ``.json`` extension.
    """
    target = outfilename + '.json'
    folder = os.path.dirname(target)
    if folder:
        os.makedirs(folder, exist_ok=True)

    with open(target, 'wt', encoding='utf-8') as f:
        for m in data:
            json.dump(m, f, ensure_ascii=False, indent=4)
            f.write('\n')

def writeJsonFileAddEndFile(data,outfilename):
    """Append each item of ``data`` as JSON to ``outfilename + '.json'``.

    The file is opened in append mode, so repeated calls accumulate
    documents in the same file. Every item is written as its own
    indented JSON document followed by a newline; the newline keeps
    documents from successive calls from running into each other.
    The parent folder is created when it does not exist yet.

    Args:
        data: Iterable of JSON-serialisable objects.
        outfilename: Output path without the ``.json`` extension.
    """
    target = outfilename + '.json'
    folder = os.path.dirname(target)
    if folder:
        os.makedirs(folder, exist_ok=True)

    with open(target, 'a', encoding='utf-8') as f:
        for m in data:
            json.dump(m, f, ensure_ascii=False, indent=4)
            f.write('\n')

def action(driver,link):
    """Open one book page and save the chapter links found on it.

    The page at ``"https://so.gushiwen.org" + link`` is loaded, the
    ``href`` of every element matching ``.bookcont a`` is collected, and
    the list is written to ``./onebooklink/<title>.txt``, where
    ``<title>`` is the text of the page's ``h1`` element.

    Args:
        driver: Selenium WebDriver used to load the page.
        link: Site-relative path of the book page. An empty value is
            skipped, because it would just load the site's root URL.
    """
    if not link:
        return

    url = "https://so.gushiwen.org" + link
    driver.get(url)

    elements = driver.find_elements_by_css_selector(".bookcont a")
    title = driver.find_element_by_css_selector("h1").text

    # Skip anchors without a usable href; the writer needs strings.
    hrefs = (e.get_property('href') for e in elements)
    booklinks = [href for href in hrefs if href]

    # The title becomes a filename, so replace characters that are not
    # allowed in file names (path separators and Windows-reserved ones).
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)

    writeTxtFile(booklinks, './onebooklink/' + safe_title)

def createLinkList():
    """Collect the chapter links of every book listed in ``gujilinks.txt``.

    A browser session is started with ``first()``, ``action()`` is run
    for each link read from the file, and the browser is shut down
    afterwards.
    """
    driver = first()
    try:
        links = readlinkfile('gujilinks.txt')
        for link in links:
            action(driver, link)
    finally:
        # Always close the browser, even if a page fails, so no
        # Firefox/geckodriver process is left behind.
        driver.quit()

# createLinkList()

def soup(gushiurl):
    """Download one page and extract its title and text passages.

    Args:
        gushiurl: Full URL of the page. A falsy value (empty string or
            ``None``) is skipped.

    Returns:
        ``None`` when ``gushiurl`` is falsy. Otherwise a one-element
        list holding a dict with two keys: ``'title'``, the text of the
        first ``h1`` with the ``'\\n译注\\n\\n'`` marker removed (an empty
        string when the page has no ``h1``), and ``'text'``, a list with
        the text of every ``div`` of class ``contson``.
    """
    if not gushiurl:
        return None

    print(gushiurl)

    # Close the HTTP response as soon as the body has been read.
    with urlopen(gushiurl) as response:
        html = response.read().decode('utf-8')

    page = BeautifulSoup(html, features='lxml')

    contsons = page.find_all('div', {"class": "contson"})
    heading = page.find('h1')

    # A page without an h1 gets an empty title instead of an IndexError.
    h1 = heading.get_text().replace('\n译注\n\n', '') if heading else ''
    text = [item.get_text() for item in contsons]

    return [{
        'title': h1,
        'text': text
    }]

path = "G:\\workspace\\python\\selenium\\guji\\restlinks" # folder directory; run() passes it to file_name_walk()
# Names collected by file_name_walk(); run() iterates over this list.
Files_Global = []

def file_name_walk(file_dir):
    """Append the names of the files directly inside ``file_dir`` to ``Files_Global``.

    Only regular files are collected; sub-directories are skipped. The
    names are appended without their directory part, and repeated calls
    keep adding to the same list.

    Args:
        file_dir: Directory to list.
    """
    for entry in os.listdir(file_dir):
        # Keep non-directory entries of the current path only.
        if os.path.isfile(os.path.join(file_dir, entry)):
            Files_Global.append(entry)

def getOne(name):
    """Scrape every page listed for one book and append the results to its JSON file.

    Links are read from ``./onebooklink/<name>.txt`` and each page's
    content is appended to ``./gujisourse/<name>.json``.

    Args:
        name: Book name, used for both the link file and the output file.
    """
    links = readlinkfile('./onebooklink/' + name + '.txt')
    for link in links:
        contents = soup(link)
        # soup() returns None for an empty link; there is nothing to
        # write then, and passing None on would raise a TypeError.
        if contents:
            writeJsonFileAddEndFile(contents, './gujisourse/' + name)

def getOne2(name):
    """Scrape every page listed in one link file and append the results to a JSON file.

    Links are read from ``./restlinks/<name>``. Before each request the
    function sleeps 0.2 seconds and prints the running link index. Page
    content is appended to ``./gujisourse/<name without '.txt'>.json``.

    Args:
        name: File name of the link list inside ``./restlinks/``.
    """
    print(name)
    links = readlinkfile('./restlinks/' + name)

    # The output name does not change per link, so compute it once.
    outname = name.replace('.txt', '')

    for index, link in enumerate(links):
        time.sleep(0.2)  # pause between requests
        print(index)
        contents = soup(link)
        if contents:
            writeJsonFileAddEndFile(contents, './gujisourse/' + outname)

def run():
    """Scrape every link file found in the ``path`` directory.

    ``file_name_walk(path)`` fills ``Files_Global``; each collected name
    is printed and then processed with ``getOne2()``.

    Raises:
        Exception: Whatever ``getOne2()`` raises is re-raised unchanged
            after the failing file name has been reported.
    """
    file_name_walk(path)

    for name in Files_Global:
        print(name)

        try:
            getOne2(name)
        except Exception:
            # Report which file failed, then re-raise with the original
            # traceback intact.
            print('failed while processing ' + name)
            raise

# run()

XI 1985

搜索此博客

如何爬取网页？可运行的 Python 脚本

标签

评论

发表评论

此博客中的热门博文

了解视频广告格式

设置转化跟踪来实现目标

5. 采用 Google 展示广告投放最合适的广告内容