Add WeChat official account crawler; the unusual image format is not yet handled

master
zcy 2024-04-23 00:39:40 +08:00
parent ee0478010d
commit 44dcb062e0
2 changed files with 198 additions and 2 deletions


@@ -399,10 +399,19 @@ def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
                    except:
                        continue
                if response.status_code==200:
                    article += ''' <img src="%d.jpg" width="100%%"/> \n\n'''%number
                    # article += '''<img src="%d.jpg"/>'''%number
                    with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj:
                        obj.write(response.content)
                    files = {'image': open(os.path.join(dircrea, str(number) + '.jpg'), 'rb')}
                    response1 = requests.post('https://www.testingcloud.club/sapi/api/image_upload',
                                              files=files, timeout=30)
                    if response1.status_code==200:
                        jsons = json.loads(response1.text)
                        print(jsons)
                        article += ''' <img src="https://www.testingcloud.club/sapi/api/image_download/%s" width="100%%"/> \n\n'''%jsons['url']
                    requests.put(imglink, timeout=30)
                number += 1
                crawlsleep(sleeptime)
            elif tag_name=="div":
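This hunk changes the image handling in parser_beautiful: instead of only saving each image as <number>.jpg and referencing the local file, it uploads the saved file to the image_upload endpoint and embeds the image_download URL built from the response. A minimal sketch of that re-hosting step pulled out into a standalone helper (the helper name, the constants, and the error handling are assumptions; the endpoint, the 'image' form field, and the 'url' JSON key are taken from the diff):

```python
import requests

UPLOAD_URL = 'https://www.testingcloud.club/sapi/api/image_upload'
DOWNLOAD_FMT = 'https://www.testingcloud.club/sapi/api/image_download/%s'

def rehost_image(local_path, timeout=30):
    """Upload a locally saved image; return a Markdown <img> snippet, or None on failure."""
    with open(local_path, 'rb') as fp:
        resp = requests.post(UPLOAD_URL, files={'image': fp}, timeout=timeout)
    if resp.status_code != 200:
        return None
    jsons = resp.json()  # the diff shows the response carrying a 'url' field
    return ''' <img src="%s" width="100%%"/> \n\n''' % (DOWNLOAD_FMT % jsons['url'])
```

Opening the file in a with block also closes the handle that the inline version opens and never closes.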
@@ -534,6 +543,7 @@ def recursion(nod, article, number, driver, dircrea, bk=False):
                    # article += '''<img src="%d.jpg"/>'''%number
                    with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj:
                        obj.write(response.content)
                    response = requests.post(url, files=files)
                number += 1
                crawlsleep(sleeptime)
    return article, number
@@ -632,7 +642,7 @@ def crawl_article_detail(driver:webdriver):
    # article_childNodes = driver.execute_script("return arguments[0].childNodes;", richtext)
    article = ""
    number = 0
    article = article + '[转载自](%s)' % website + '\n\n'
    # for nod in article_childNodes:
    #     article, number = recursion(nod, article, number, driver, dircrea)

wechat.py (new file, 186 lines)

@@ -0,0 +1,186 @@
from selenium import webdriver
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import time
import os
from selenium.webdriver.edge.service import Service
from selenium.webdriver import EdgeOptions
import requests
import json
from selenium.webdriver.common.by import By
def crawlsleep(times):
    time.sleep(times)
def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
    if not innerHTML:
        return article, number
    if bk:
        article += "**"
    if isinstance(innerHTML, str):
        article += innerHTML.text
        return article, number
    for chi in innerHTML.children:
        # article, number = parser_beautiful(chi, article, number, dircrea, bk)
        tag_name = chi.name
        if isinstance(chi, str):
            article += chi.text
            continue
        else:
            cll = [c for c in chi.children]
        if tag_name in ['table', 'tbody', 'tr', 'td', 'u', 'em']:
            article, number = parser_beautiful(chi, article, number, dircrea, bk)
        elif tag_name=="br":
            article += "\n"
        elif tag_name=="p":
            article, number = parser_beautiful(chi, article, number, dircrea, bk)
            article += "\n\n"
        # elif tag_name=="br":
        #     article += "<br>\n"
        elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            article += '#' * int(tag_name[-1]) + ' '
            article, number = parser_beautiful(chi, article, number, dircrea, bk)
            article += '\n\n'
        elif tag_name=="span":
            datatex = None
            classc = None
            if 'data-tex' in chi.attrs.keys():
                datatex = chi.attrs["data-tex"]
            if 'class' in chi.attrs.keys():
                classc = chi.attrs["class"]
            if datatex and classc and 'ztext-math' in classc:
                content = chi.attrs["data-tex"]
                while len(content) > 0 and ' '==content[0]:
                    content = content[1:]
                while len(content) > 0 and ' '==content[-1]:
                    content = content[:-1]
                if len(content) > 0:
                    if article[-3-1:]=='<br>' or article[-1:]=='\n':
                        article += "\n$" + content + "$"
                    else:
                        article += "$" + content + "$"
            else:
                article, number = parser_beautiful(chi, article, number, dircrea, bk)
                # article += nod.text
        elif tag_name=="a":
            linksite = None
            if 'href' in chi.attrs.keys():
                linksite = chi.attrs['href']
            if linksite:
                linksite = linksite.replace("//link.zhihu.com/?target=https%3A", "").replace("//link.zhihu.com/?target=http%3A", "")
                if len(article) > 0 and article[-1]=='\n':
                    article += "["+chi.text+"]"+"("+linksite + ")"
                else:
                    article += "\n\n["+chi.text+"]"+"("+linksite + ")"
        elif tag_name=='b' or tag_name=='strong':
            if len(cll) > 1:
                article, number = parser_beautiful(chi, article, number, dircrea, True)
            else:
                txt = chi.text
                while len(txt) > 0 and txt[-1] == " ":
                    txt = txt[:-1]
                article += " **" + txt + "** "
        elif tag_name=="figure":
            noscript = chi.find_all('noscript')
            if len(noscript) > 0:
                chi.noscript.extract()
            imgchunk = chi.find_all('img')
            for i in range(len(imgchunk)):
                imglink = None
                if 'data-original' in imgchunk[i].attrs.keys():
                    imglink = imgchunk[i].attrs["data-original"]
                if 'data-actualsrc' in imgchunk[i].attrs.keys():
                    imglink = imgchunk[i].attrs['data-actualsrc']
                if imglink==None:
                    imglink = imgchunk[i].attrs["src"]
                try:
                    response = requests.get(imglink, timeout=30)
                except:
                    try:
                        response = requests.get(imglink, timeout=30)
                    except:
                        continue
                if response.status_code==200:
                    with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj:
                        obj.write(response.content)
                    files = {'image': open(os.path.join(dircrea, str(number) + '.jpg'), 'rb')}
                    response1 = requests.post('https://www.testingcloud.club/sapi/api/image_upload',
                                              files=files, timeout=30)
                    if response1.status_code==200:
                        continue
                        # jsons = json.loads(response1.text)
                        # print(jsons)
                        # article += ''' <img src="https://www.testingcloud.club/sapi/api/image_download/%s" width="100%%"/> \n\n'''%jsons['url']
                    requests.put(imglink, timeout=30)
                number += 1
                crawlsleep(1)
        elif tag_name=="div":
            prenode = chi.find_all('code')
            if len(prenode) > 0:
                for i in prenode:
                    article += "\n\n```\n" + i.text + "\n```\n\n"
            else:
                article, number = parser_beautiful(chi, article, number, dircrea, bk)
                article += "\n\n"
    if bk:
        article += "**"
    return article, number
# Xpanx.com custom web crawler development, contact WeChat LiteMango (paid service)
# Set the webdriver path below; replace it with the path to your own webdriver
abspath = os.path.abspath(__file__)
driverpath = os.path.join(os.path.dirname(abspath), 'msedgedriver', 'msedgedriver.exe')
service = Service(executable_path=driverpath)
edge_options = EdgeOptions()
# https://stackoverflow.com/questions/53039551/selenium-webdriver-modifying-navigator-webdriver-flag-to-prevent-selenium-detec
edge_options.add_experimental_option('excludeSwitches', ['enable-automation'])
edge_options.add_experimental_option('useAutomationExtension', False)
edge_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
edge_options.add_argument("disable-blink-features=AutomationControlled")  # this argument removes the webdriver automation fingerprint
# Initialize the webdriver
driver = webdriver.Edge(options=edge_options, service=service)
# URL of the WeChat official account article
url = 'https://mp.weixin.qq.com/s?__biz=MzI4Njg5MDA5NA==&mid=2247483973&idx=2&sn=483265ffa9087ca956ec2d637119a5f8&chksm=ebd74344dca0ca5298b894fbb706c26ee942a423e858e27679f06df4b83899e1a97cc9d5eb97&scene=21###wechat_redirect'
# Open the page
driver.get(url)
# Wait a while so the page finishes loading
time.sleep(5)
# Get the page source
html = driver.page_source
# Parse the HTML
soup = BeautifulSoup(html, 'html.parser')
# Extract the article title
title = soup.find('h1', {'class': 'rich_media_title'}).get_text(strip=True)
# Extract the article body
content = soup.find('div', {'id': 'js_content'})
richtext = driver.find_element(By.TAG_NAME, "section")
article = ''
print(content)
inner = driver.execute_script("return arguments[0].innerHTML;", richtext)
innerHTML = BeautifulSoup(inner, "html.parser")
res, num = parser_beautiful(innerHTML, article, 0, 'd://', False)
# Convert the HTML to Markdown
# markdown = md(str(content))
# Write the Markdown to a file
with open(f'{title}.md', 'w', encoding='utf-8') as f:
    f.write(res)
# Close the webdriver
driver.quit()
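As the commit message says, the image handling in wechat.py is left unresolved: every response is written to <number>.jpg and the re-hosted link is commented out. WeChat articles typically serve images from mmbiz.qpic.cn, lazy-load them through a data-src attribute (rather than the data-original/data-actualsrc keys the loop checks), and flag the nominal format with a wx_fmt query parameter, while the bytes actually returned are often webp, so blindly saving them with a .jpg extension can produce files a Markdown viewer or the upload endpoint rejects. One possible direction, sketched here under those assumptions with Pillow, which the commit does not use; fetch_wechat_image_as_jpeg is a hypothetical helper:

```python
from io import BytesIO
from urllib.parse import urlparse, parse_qs

import requests
from PIL import Image  # assumption: Pillow is an acceptable extra dependency

def fetch_wechat_image_as_jpeg(imglink, out_path, timeout=30):
    """Download one article image and normalize it to JPEG regardless of wx_fmt."""
    response = requests.get(imglink, timeout=timeout)
    if response.status_code != 200:
        return False
    fmt = parse_qs(urlparse(imglink).query).get('wx_fmt', ['jpeg'])[0]
    if fmt == 'gif':
        # An animated GIF would lose its frames as JPEG, so keep it as-is.
        with open(out_path.replace('.jpg', '.gif'), 'wb') as obj:
            obj.write(response.content)
        return True
    # Pillow decodes png/webp payloads; convert() drops alpha so JPEG saving works.
    img = Image.open(BytesIO(response.content)).convert('RGB')
    img.save(out_path, 'JPEG')
    return True
```

Inside parser_beautiful this would replace the bare requests.get plus obj.write pair before the upload step, and the normalized file could then be passed to the image_upload endpoint as before.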