Add a WeChat official account crawler; the special image format is still unresolved
parent ee0478010d
commit 44dcb062e0

crawler.py (14 changed lines)
@@ -399,10 +399,19 @@ def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
                     except:
                         continue
                 if response.status_code==200:
-                    article += ''' <img src="%d.jpg" width="100%%"/> \n\n'''%number
                     # article += '''<img src="%d.jpg"/>'''%number
                     with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj:
                         obj.write(response.content)
+
+                    files = {'image': open(os.path.join(dircrea, str(number) + '.jpg'), 'rb')}
+                    response1 = requests.post('https://www.testingcloud.club/sapi/api/image_upload',
+                                              files=files, timeout=30)
+                    if response1.status_code==200:
+                        jsons = json.loads(response1.text)
+                        print(jsons)
+                        article += ''' <img src="https://www.testingcloud.club/sapi/api/image_download/%s" width="100%%"/> \n\n'''%jsons['url']
+
+                    requests.put(imglink, timeout=30)
                 number += 1
                 crawlsleep(sleeptime)
             elif tag_name=="div":
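The hunk above swaps the local `%d.jpg` reference for a rehosted one: the downloaded image is uploaded to the image_upload endpoint and the token it returns is embedded via image_download. A minimal sketch of that round-trip, assuming (as the code implies) the endpoint takes a multipart field named 'image' and answers with JSON whose 'url' field is the download token:

import requests

def rehost_image(local_path):
    # Upload a local image and return its public download URL, or None on failure.
    with open(local_path, 'rb') as fobj:
        resp = requests.post('https://www.testingcloud.club/sapi/api/image_upload',
                             files={'image': fobj}, timeout=30)
    if resp.status_code != 200:
        return None
    return 'https://www.testingcloud.club/sapi/api/image_download/%s' % resp.json()['url']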
@@ -534,6 +543,7 @@ def recursion(nod, article, number, driver, dircrea, bk=False):
             # article += '''<img src="%d.jpg"/>'''%number
             with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj:
                 obj.write(response.content)
+            response = requests.post(url, files=files)
             number += 1
             crawlsleep(sleeptime)
         return article, number
@@ -632,7 +642,7 @@ def crawl_article_detail(driver:webdriver):
     # article_childNodes = driver.execute_script("return arguments[0].childNodes;", richtext)
     article = ""
     number = 0
+    article = article + '[转载自](%s)' % website + '\n\n'
     # for nod in article_childNodes:
     #     article, number = recursion(nod, article, number, driver, dircrea)
@@ -0,0 +1,186 @@ (new file: the WeChat official account crawler)

from selenium import webdriver
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import time
import os
from selenium.webdriver.edge.service import Service
from selenium.webdriver import EdgeOptions
import requests
import json
from selenium.webdriver.common.by import By


def crawlsleep(times):
    time.sleep(times)

def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
    """Recursively walk a BeautifulSoup node and append Markdown to `article`.

    `number` is the running image counter, `dircrea` the directory images are
    saved into, and `bk` wraps the emitted text in ** when inside a bold tag.
    """
    if not innerHTML:
        return article, number
    if bk:
        article += "**"
    if isinstance(innerHTML, str):
        article += innerHTML.text
        return article, number

    for chi in innerHTML.children:
        # article, number = parser_beautiful(chi, article, number, dircrea, bk)
        tag_name = chi.name
        if isinstance(chi, str):
            article += chi.text
            continue
        else:
            cll = [c for c in chi.children]
        if tag_name in ['table', 'tbody', 'tr', 'td', 'u', 'em']:
            article, number = parser_beautiful(chi, article, number, dircrea, bk)
        elif tag_name == "br":
            article += "\n"
        elif tag_name == "p":
            article, number = parser_beautiful(chi, article, number, dircrea, bk)
            article += "\n\n"
        # elif tag_name=="br":
        #     article += "<br>\n"
        elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            article += '#' * int(tag_name[-1]) + ' '
            article, number = parser_beautiful(chi, article, number, dircrea, bk)
            article += '\n\n'
        elif tag_name == "span":
            # Zhihu renders formulas as <span class="ztext-math" data-tex="...">;
            # turn them into inline $...$ math.
            datatex = None
            classc = None
            if 'data-tex' in chi.attrs.keys():
                datatex = chi.attrs["data-tex"]
            if 'class' in chi.attrs.keys():
                classc = chi.attrs["class"]
            if datatex and classc and 'ztext-math' in classc:
                content = chi.attrs["data-tex"].strip(' ')
                if len(content) > 0:
                    if article[-4:] == '<br>' or article[-1:] == '\n':
                        article += "\n$" + content + "$"
                    else:
                        article += "$" + content + "$"
            else:
                article, number = parser_beautiful(chi, article, number, dircrea, bk)
                # article += nod.text
        elif tag_name == "a":
            linksite = None
            if 'href' in chi.attrs.keys():
                linksite = chi.attrs['href']
            if linksite:
                # Unwrap Zhihu's outbound-link redirector.
                linksite = linksite.replace("//link.zhihu.com/?target=https%3A", "").replace("//link.zhihu.com/?target=http%3A", "")
                if len(article) > 0 and article[-1] == '\n':
                    article += "[" + chi.text + "](" + linksite + ")"
                else:
                    article += "\n\n[" + chi.text + "](" + linksite + ")"
        elif tag_name == 'b' or tag_name == 'strong':
            if len(cll) > 1:
                article, number = parser_beautiful(chi, article, number, dircrea, True)
            else:
                txt = chi.text.rstrip(' ')
                article += " **" + txt + "** "
        elif tag_name == "figure":
            # Images: drop the <noscript> duplicate, then fetch every <img>.
            noscript = chi.find_all('noscript')
            if len(noscript) > 0:
                chi.noscript.extract()
            imgchunk = chi.find_all('img')
            for i in range(len(imgchunk)):
                imglink = None
                if 'data-original' in imgchunk[i].attrs.keys():
                    imglink = imgchunk[i].attrs["data-original"]
                if 'data-actualsrc' in imgchunk[i].attrs.keys():
                    imglink = imgchunk[i].attrs['data-actualsrc']
                if imglink is None:
                    imglink = imgchunk[i].attrs["src"]
                try:
                    response = requests.get(imglink, timeout=30)
                except:
                    try:
                        # one retry before giving up on this image
                        response = requests.get(imglink, timeout=30)
                    except:
                        continue
                if response.status_code == 200:
                    with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj:
                        obj.write(response.content)

                    with open(os.path.join(dircrea, str(number) + '.jpg'), 'rb') as fobj:
                        response1 = requests.post('https://www.testingcloud.club/sapi/api/image_upload',
                                                  files={'image': fobj}, timeout=30)
                    if response1.status_code == 200:
                        # Embedding is disabled for now: the special WeChat image
                        # format mentioned in the commit message is not yet resolved.
                        continue
                        # jsons = json.loads(response1.text)
                        # print(jsons)
                        # article += ''' <img src="https://www.testingcloud.club/sapi/api/image_download/%s" width="100%%"/> \n\n'''%jsons['url']

                    requests.put(imglink, timeout=30)
                number += 1
                crawlsleep(1)
        elif tag_name == "div":
            # Code blocks: fence each <code> child's text; otherwise recurse.
            prenode = chi.find_all('code')
            if len(prenode) > 0:
                for i in prenode:
                    article += "\n\n```\n" + i.text + "\n```\n\n"
            else:
                article, number = parser_beautiful(chi, article, number, dircrea, bk)
                article += "\n\n"
    if bk:
        article += "**"
    return article, number

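# A usage sketch with a hypothetical fragment, to illustrate the contract:
# pass a parsed fragment, the Markdown so far, a starting image counter, and
# an image directory; get back the Markdown plus the updated counter.
#
#   fragment = BeautifulSoup('<p>Hello <b>world</b></p><h2>Title</h2>', 'html.parser')
#   md_text, img_count = parser_beautiful(fragment, '', 0, 'imgs')  # 'imgs' is a stand-in dir
#   print(repr(md_text))  # -> 'Hello  **world** \n\n## Title\n\n'
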
# Xpanx.com custom web crawler development; WeChat: LiteMango (paid)
# Set the webdriver path; replace it with your own webdriver path
abspath = os.path.dirname(os.path.abspath(__file__))

driverpath = os.path.join(abspath, 'msedgedriver', 'msedgedriver.exe')
service = Service(executable_path=driverpath)
edge_options = EdgeOptions()

# https://stackoverflow.com/questions/53039551/selenium-webdriver-modifying-navigator-webdriver-flag-to-prevent-selenium-detec
edge_options.add_experimental_option('excludeSwitches', ['enable-automation'])
edge_options.add_experimental_option('useAutomationExtension', False)
edge_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
edge_options.add_argument("disable-blink-features=AutomationControlled")  # this argument removes the webdriver fingerprint
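# To verify the masking took effect, one can query the flag once the driver
# below is up (navigator.webdriver should come back None instead of true):
#   print(driver.execute_script("return navigator.webdriver"))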

# Initialize the webdriver
driver = webdriver.Edge(options=edge_options, service=service)

# URL of the WeChat official account article
url = 'https://mp.weixin.qq.com/s?__biz=MzI4Njg5MDA5NA==&mid=2247483973&idx=2&sn=483265ffa9087ca956ec2d637119a5f8&chksm=ebd74344dca0ca5298b894fbb706c26ee942a423e858e27679f06df4b83899e1a97cc9d5eb97&scene=21###wechat_redirect'

# Open the page
driver.get(url)

# Give the page time to finish loading
time.sleep(5)

# Grab the page source
html = driver.page_source

# Parse the HTML
soup = BeautifulSoup(html, 'html.parser')

# Extract the article title
title = soup.find('h1', {'class': 'rich_media_title'}).get_text(strip=True)

# Extract the article body
content = soup.find('div', {'id': 'js_content'})
richtext = driver.find_element(By.TAG_NAME, "section")
article = ''
print(content)
inner = driver.execute_script("return arguments[0].innerHTML;", richtext)
innerHTML = BeautifulSoup(inner, "html.parser")
res, num = parser_beautiful(innerHTML, article, 0, 'd://', False)

# Convert the HTML to Markdown
# markdown = md(str(content))

# Write the Markdown to a file
with open(f'{title}.md', 'w', encoding='utf-8') as f:
    f.write(res)

# Close the webdriver
driver.quit()
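
The commit message leaves the WeChat image format unresolved: the saver above hardcodes a .jpg extension, while mp.weixin.qq.com image URLs typically carry the real format in a wx_fmt query parameter and often serve png/gif/webp. A possible direction, sketched here as an untested helper that derives the extension from the HTTP Content-Type instead:

def image_extension(response):
    # Map the Content-Type of a downloaded image to a file extension;
    # fall back to .jpg when the header is missing or unrecognized.
    known = {'image/jpeg': '.jpg', 'image/png': '.png',
             'image/gif': '.gif', 'image/webp': '.webp'}
    ctype = response.headers.get('Content-Type', '').split(';')[0].strip()
    return known.get(ctype, '.jpg')

# Inside the figure branch this would replace the hardcoded name:
#   ext = image_extension(response)
#   with open(os.path.join(dircrea, str(number) + ext), 'wb') as obj:
#       obj.write(response.content)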