From 44dcb062e0f1f44ac413888b94613dee229406b5 Mon Sep 17 00:00:00 2001
From: zcy <290198252@qq.com>
Date: Tue, 23 Apr 2024 00:39:40 +0800
Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=BE=AE=E4=BF=A1=E5=85=AC?=
=?UTF-8?q?=E4=BC=97=E7=88=AC=E8=99=AB,=E5=9B=BE=E7=89=87=E6=A0=BC?=
=?UTF-8?q?=E5=BC=8F=E7=89=B9=E6=AE=8A=E8=BF=98=E4=B8=BA=E8=A7=A3=E5=86=B3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
crawler.py | 14 +++-
wechat.py | 186 +++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 198 insertions(+), 2 deletions(-)
create mode 100644 wechat.py
diff --git a/crawler.py b/crawler.py
index 65a2d13..d50b267 100644
--- a/crawler.py
+++ b/crawler.py
@@ -399,10 +399,19 @@ def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
except:
continue
if response.status_code==200:
- article += ''' \n\n'''%number
# article += ''''''%number
with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj:
obj.write(response.content)
+
+ files = {'image': open(os.path.join(dircrea, str(number) + '.jpg'), 'rb')}
+ response1 = requests.post('https://www.testingcloud.club/sapi/api/image_upload',
+ files=files,timeout=30)
+ if response1.status_code==200:
+ jsons = json.loads(response1.text)
+ print(jsons)
+ article += '''<img src="%s"> \n\n'''%jsons['url']
+
+ requests.put(imglink, timeout=30)
number += 1
crawlsleep(sleeptime)
elif tag_name=="div":
@@ -534,6 +543,7 @@ def recursion(nod, article, number, driver, dircrea, bk=False):
# article += ''''''%number
with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj:
obj.write(response.content)
+ response = requests.post(url, files=files)
number += 1
crawlsleep(sleeptime)
return article, number
@@ -632,7 +642,7 @@ def crawl_article_detail(driver:webdriver):
# article_childNodes = driver.execute_script("return arguments[0].childNodes;", richtext)
article = ""
number = 0
-
+ article = article + '[转载自](%s)' % website + '\n\n'
# for nod in article_childNodes:
# article, number = recursion(nod, article, number, driver, dircrea)
diff --git a/wechat.py b/wechat.py
new file mode 100644
index 0000000..67365e4
--- /dev/null
+++ b/wechat.py
@@ -0,0 +1,186 @@
+from selenium import webdriver
+from bs4 import BeautifulSoup
+from markdownify import markdownify as md
+import time
+import os
+from selenium.webdriver.edge.service import Service
+from selenium.webdriver import EdgeOptions
+import requests
+import json
+from selenium.webdriver.common.by import By
+
+def crawlsleep(times):
+ time.sleep(times)
+
+def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
+ if not innerHTML:
+ return article, number
+ if bk:
+ article += "**"
+ if isinstance(innerHTML, str):
+ article += innerHTML.text
+ return article, number
+
+ for chi in innerHTML.children:
+ # article, number = parser_beautiful(chi, article, number, dircrea, bk)
+ tag_name = chi.name
+ if isinstance(chi, str):
+ article += chi.text
+ continue
+ else:
+ cll = [c for c in chi.children]
+ if tag_name in ['table', 'tbody', 'tr', 'td', 'u', 'em']:
+ article, number = parser_beautiful(chi, article, number, dircrea, bk)
+ elif tag_name=="br":
+ article += "\n"
+ elif tag_name=="p":
+ article, number = parser_beautiful(chi, article, number, dircrea, bk)
+ article += "\n\n"
+ # elif tag_name=="br":
+ # article += "<br>\n"
+ elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+ article += '#' * int(tag_name[-1]) + ' '
+ article, number = parser_beautiful(chi, article, number, dircrea, bk)
+ article += '\n\n'
+ elif tag_name=="span":
+ datatex = None
+ classc = None
+ if 'data-tex' in chi.attrs.keys():
+ datatex = chi.attrs["data-tex"]
+ if 'class' in chi.attrs.keys():
+ classc = chi.attrs["class"]
+ if datatex and classc and 'ztext-math' in classc:
+ content = chi.attrs["data-tex"]
+ while len(content) > 0 and ' '==content[0]:
+ content = content[1:]
+ while len(content) > 0 and ' '==content[-1]:
+ content = content[:-1]
+ if len(content) > 0:
+ if article[-3-1:]=='<br>' or article[-1:]=='\n':
+ article += "\n$" + content + "$"
+ else:
+ article += "$" + content + "$"
+ else:
+ article, number = parser_beautiful(chi, article, number, dircrea, bk)
+ # article += nod.text
+ elif tag_name=="a":
+ linksite = None
+ if 'href' in chi.attrs.keys():
+ linksite = chi.attrs['href']
+ if linksite:
+ linksite = linksite.replace("//link.zhihu.com/?target=https%3A", "").replace("//link.zhihu.com/?target=http%3A", "")
+ if len(article) > 0 and article[-1]=='\n':
+ article += "["+chi.text+"]"+"("+linksite + ")"
+ else:
+ article += "\n\n["+chi.text+"]"+"("+linksite + ")"
+ elif tag_name=='b' or tag_name=='strong':
+ if len(cll) > 1:
+ article, number = parser_beautiful(chi, article, number, dircrea, True)
+ else:
+ txt = chi.text
+ while len(txt) > 0 and txt[-1] == " ":
+ txt = txt[:-1]
+ article += " **" + txt + "** "
+ elif tag_name=="figure":
+ noscript = chi.find_all('noscript')
+ if len(noscript) > 0:
+ chi.noscript.extract()
+ imgchunk = chi.find_all('img')
+ for i in range(len(imgchunk)):
+ imglink = None
+ if 'data-original' in imgchunk[i].attrs.keys():
+ imglink = imgchunk[i].attrs["data-original"]
+
+ if 'data-actualsrc' in imgchunk[i].attrs.keys():
+ imglink = imgchunk[i].attrs['data-actualsrc']
+
+ if imglink==None:
+ imglink = imgchunk[i].attrs["src"]
+ try:
+ response = requests.get(imglink, timeout=30)
+ except:
+ try:
+ response = requests.get(imglink, timeout=30)
+ except:
+ continue
+ if response.status_code==200:
+ with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj:
+ obj.write(response.content)
+
+ files = {'image': open(os.path.join(dircrea, str(number) + '.jpg'), 'rb')}
+ response1 = requests.post('https://www.testingcloud.club/sapi/api/image_upload',
+ files=files,timeout=30)
+ if response1.status_code==200:
+ continue
+ # jsons = json.loads(response1.text)
+ # print(jsons)
+ # article += '''<img src="%s"> \n\n'''%jsons['url']
+
+ requests.put(imglink, timeout=30)
+ number += 1
+ crawlsleep(1)
+ elif tag_name=="div":
+ prenode = chi.find_all('code')
+ if len(prenode) > 0:
+ for i in prenode:
+ article += "\n\n```\n" + i.text + "\n```\n\n"
+ else:
+ article, number = parser_beautiful(chi, article, number, dircrea, bk)
+ article += "\n\n"
+ if bk:
+ article += "**"
+ return article, number
+
+
+# Xpanx.com 专业网络爬虫程序定制,加微信 LiteMango(付费)
+# 设置webdriver路径,替换为你的webdriver路径
+abspath = os.path.abspath(__file__)
+
+driverpath = os.path.join(abspath, 'msedgedriver\msedgedriver.exe')
+service = Service(executable_path=driverpath)
+edge_options = EdgeOptions()
+
+#https://stackoverflow.com/questions/53039551/selenium-webdriver-modifying-navigator-webdriver-flag-to-prevent-selenium-detec
+edge_options.add_experimental_option('excludeSwitches', ['enable-automation'])
+edge_options.add_experimental_option('useAutomationExtension', False)
+edge_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
+edge_options.add_argument("disable-blink-features=AutomationControlled")#就是这一行告诉chrome去掉了webdriver痕迹
+
+# 初始化webdriver
+driver = webdriver.Edge(options=edge_options, service = service)
+
+# 微信公众号文章的 URL
+url = 'https://mp.weixin.qq.com/s?__biz=MzI4Njg5MDA5NA==&mid=2247483973&idx=2&sn=483265ffa9087ca956ec2d637119a5f8&chksm=ebd74344dca0ca5298b894fbb706c26ee942a423e858e27679f06df4b83899e1a97cc9d5eb97&scene=21###wechat_redirect'
+
+# 打开页面
+driver.get(url)
+
+# 等待一定时间让页面加载完成
+time.sleep(5)
+
+# 获取页面的源代码
+html = driver.page_source
+
+# 解析 HTML
+soup = BeautifulSoup(html, 'html.parser')
+
+# 提取文章标题
+title = soup.find('h1', {'class': 'rich_media_title'}).get_text(strip=True)
+
+# 提取文章内容
+content = soup.find('div', {'id': 'js_content'})
+richtext = driver.find_element(By.TAG_NAME, "section")
+article = ''
+print(content)
+inner = driver.execute_script("return arguments[0].innerHTML;", richtext)
+innerHTML = BeautifulSoup(inner, "html.parser")
+res,num = parser_beautiful(innerHTML,article,0,'d://',False)
+# 将 HTML 转换为 Markdown
+#markdown = md(str(content))
+
+# 将 Markdown 写入文件
+with open(f'{title}.md', 'w', encoding='utf-8') as f:
+ f.write(res)
+
+# 关闭webdriver
+driver.quit()
\ No newline at end of file