#!/usr/bin/python
# -*- coding: utf-8 -*-  # declare the source encoding; Python 3 strings are Unicode by default
import json
import os
import time

from bs4 import BeautifulSoup
from urllib.request import urlopen
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
def first():
    """Create a headless Firefox driver with stylesheets and images disabled."""
    binary = r'C:\Program Files\Mozilla Firefox32\firefox.exe'
    options = Options()
    options.headless = True  # options.set_headless() is deprecated
    options.binary = binary
    cap = DesiredCapabilities().FIREFOX
    cap["marionette"] = True  # optional
    fp = webdriver.FirefoxProfile()
    fp.set_preference("permissions.default.stylesheet", 2)  # block CSS
    fp.set_preference("permissions.default.image", 2)       # block images
    # Bug fix: the profile was created but never passed to the driver,
    # so the CSS/image blocking never actually took effect.
    driver = webdriver.Firefox(
        firefox_profile=fp,
        options=options,
        capabilities=cap,
        executable_path="C:\\Program Files\\geckodriver-v0.26.0-win64\\geckodriver.exe",
    )
    return driver
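# Note: firefox_options=, capabilities= and the headless toggles above are
# Selenium 3 APIs. Under Selenium 4 the equivalent setup would roughly be
# (untested sketch, assuming geckodriver is on PATH):
#
#     from selenium.webdriver.firefox.service import Service
#     options = Options()
#     options.add_argument('-headless')
#     driver = webdriver.Firefox(service=Service(), options=options)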
def readlinkfile(sourcename='tangshisanbaishou.txt'):
    """Read a link file (one URL per line) and return the links as a list."""
    with open(sourcename, 'rt', encoding='utf-8') as f:
        data = f.read()
    # splitlines() avoids the empty trailing "link" that re.split('\n', ...)
    # produced when the file ends with a newline; the explicit f.close()
    # was redundant inside the with-block.
    return [link for link in data.splitlines() if link]
def writeTxtFile(data, outfilename):
    """Write one entry per line to <outfilename>.txt."""
    with open(outfilename + '.txt', 'wt', encoding='utf-8') as f:
        for m in data:
            f.write(m + '\n')
def writeJsonFile(data, outfilename):
    """Dump each entry to <outfilename>.json. Note: dumping entries one after
    another produces concatenated JSON objects, not a single valid JSON
    document; see the reader sketch after writeJsonFileAddEndFile()."""
    with open(outfilename + '.json', 'wt', encoding='utf-8') as f:
        for m in data:
            json.dump(m, f, ensure_ascii=False, indent=4)
            f.write('\n')  # separate consecutive objects
def writeJsonFileAddEndFile(data, outfilename):
    """Append entries to <outfilename>.json (same concatenated-JSON caveat
    as writeJsonFile)."""
    with open(outfilename + '.json', 'a', encoding='utf-8') as f:
        for m in data:
            json.dump(m, f, ensure_ascii=False, indent=4)
            f.write('\n')  # separate consecutive objects
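# Because the two writers above concatenate pretty-printed JSON objects,
# json.load() cannot read the files back in one call. A reader like the
# sketch below (a hypothetical helper, not part of the original script)
# can walk the stream with json.JSONDecoder.raw_decode:
def readJsonObjects(infilename):
    decoder = json.JSONDecoder()
    with open(infilename + '.json', 'rt', encoding='utf-8') as f:
        buf = f.read()
    objs, pos = [], 0
    while pos < len(buf):
        while pos < len(buf) and buf[pos].isspace():
            pos += 1  # skip whitespace between concatenated objects
        if pos >= len(buf):
            break
        obj, pos = decoder.raw_decode(buf, pos)
        objs.append(obj)
    return objs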
def action(driver, link):
    """Open a book's index page and save all of its chapter links."""
    url = "https://so.gushiwen.org" + link
    driver.get(url)
    booklinks = []
    elements = driver.find_elements_by_css_selector(".bookcont a")
    title = driver.find_element_by_css_selector("h1").text
    for e in elements:
        booklinks.append(e.get_property('href'))
    writeTxtFile(booklinks, './onebooklink/' + title)
def createLinkList():
    """Step 1: for every book link in gujilinks.txt, collect its chapter links."""
    driver = first()
    links = readlinkfile('gujilinks.txt')
    for link in links:
        action(driver, link)
    driver.quit()
# createLinkList()
def soup(gushiurl):
    """Fetch one chapter page and return [{'title': ..., 'text': [...]}]."""
    if not gushiurl:
        return None
    print(gushiurl)
    html = urlopen(gushiurl).read().decode('utf-8')
    page = BeautifulSoup(html, features='lxml')  # renamed from 'soup', which shadowed this function
    contsons = page.find_all('div', {"class": "contson"})
    title = page.find_all('h1')
    if not title:
        return None  # page without an <h1>, e.g. an error page
    h1 = title[0].get_text().replace('\n译注\n\n', '')  # strip the "annotations" suffix from the heading
    text = [item.get_text() for item in contsons]
    return [{'title': h1, 'text': text}]
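# soup() calls urlopen() with no error handling, so one transient network
# failure aborts the whole run. A retry wrapper like the sketch below (a
# hypothetical helper, not in the original) could be swapped in for the
# urlopen() call above:
def fetch_with_retry(url, retries=3, delay=1.0):
    from urllib.error import URLError
    for attempt in range(retries):
        try:
            return urlopen(url).read().decode('utf-8')
        except URLError:
            if attempt == retries - 1:
                raise
            time.sleep(delay * (attempt + 1))  # simple linear backoff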
path = "G:\\workspace\\python\\selenium\\guji\\restlinks"  # directory holding the remaining link files
Files_Global = []
def file_name_walk(file_dir):
    """Collect the names of the entries directly under file_dir into Files_Global."""
    for files in os.listdir(file_dir):
        Files_Global.append(files)  # note: os.listdir() also returns subdirectories, not only files
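# If restlinks/ ever contains subdirectories, the walk above would pick them
# up too. A stricter variant (a sketch, not in the original) keeps plain
# files only:
def file_name_walk_files_only(file_dir):
    for entry in os.listdir(file_dir):
        if os.path.isfile(os.path.join(file_dir, entry)):
            Files_Global.append(entry)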
def getOne(name):
    """Scrape every chapter listed in ./onebooklink/<name>.txt."""
    links = readlinkfile('./onebooklink/' + name + '.txt')
    for link in links:
        contents = soup(link)
        if contents:  # soup() returns None for blank links or bad pages
            writeJsonFileAddEndFile(contents, './gujisourse/' + name)
def getOne2(name):
    """Like getOne(), but reads from ./restlinks/ and throttles requests."""
    print(name)
    links = readlinkfile('./restlinks/' + name)
    name = name.replace('.txt', '')  # moved out of the loop; it only needs to run once
    for index, link in enumerate(links):
        time.sleep(0.2)  # be polite to the server
        print(index)
        contents = soup(link)
        if contents:
            writeJsonFileAddEndFile(contents, './gujisourse/' + name)
def run():
    """Step 2: scrape every link file found under `path`."""
    file_name_walk(path)
    for name in Files_Global:
        print(name)
        getOne2(name)  # the original try/except only re-raised, so it is dropped
# run()
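# A possible entry point (a sketch; the original simply leaves the
# createLinkList() and run() calls commented out above):
if __name__ == '__main__':
    # Typical two-step usage: build the per-book link lists first, then
    # scrape each chapter. Uncomment whichever step you need.
    # createLinkList()
    # run()
    pass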