From 44dcb062e0f1f44ac413888b94613dee229406b5 Mon Sep 17 00:00:00 2001 From: zcy <290198252@qq.com> Date: Tue, 23 Apr 2024 00:39:40 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=BE=AE=E4=BF=A1=E5=85=AC?= =?UTF-8?q?=E4=BC=97=E7=88=AC=E8=99=AB,=E5=9B=BE=E7=89=87=E6=A0=BC?= =?UTF-8?q?=E5=BC=8F=E7=89=B9=E6=AE=8A=E8=BF=98=E4=B8=BA=E8=A7=A3=E5=86=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crawler.py | 14 +++- wechat.py | 186 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 198 insertions(+), 2 deletions(-) create mode 100644 wechat.py diff --git a/crawler.py b/crawler.py index 65a2d13..d50b267 100644 --- a/crawler.py +++ b/crawler.py @@ -399,10 +399,19 @@ def parser_beautiful(innerHTML, article, number, dircrea, bk=False): except: continue if response.status_code==200: - article += ''' \n\n'''%number # article += ''''''%number with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj: obj.write(response.content) + + files = {'image': open(os.path.join(dircrea, str(number) + '.jpg'), 'rb')} + response1 = requests.post('https://www.testingcloud.club/sapi/api/image_upload', + files=files,timeout=30) + if response1.status_code==200: + jsons = json.loads(response1.text) + print(jsons) + article += ''' \n\n'''%jsons['url'] + + requests.put(imglink, timeout=30) number += 1 crawlsleep(sleeptime) elif tag_name=="div": @@ -534,6 +543,7 @@ def recursion(nod, article, number, driver, dircrea, bk=False): # article += ''''''%number with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj: obj.write(response.content) + response = requests.post(url, files=files) number += 1 crawlsleep(sleeptime) return article, number @@ -632,7 +642,7 @@ def crawl_article_detail(driver:webdriver): # article_childNodes = driver.execute_script("return arguments[0].childNodes;", richtext) article = "" number = 0 - + article = article + '[转载自](%s)' % website + '\n\n' # for nod in article_childNodes: # article, number = recursion(nod, article, number, driver, dircrea) diff --git a/wechat.py b/wechat.py new file mode 100644 index 0000000..67365e4 --- /dev/null +++ b/wechat.py @@ -0,0 +1,186 @@ +from selenium import webdriver +from bs4 import BeautifulSoup +from markdownify import markdownify as md +import time +import os +from selenium.webdriver.edge.service import Service +from selenium.webdriver import EdgeOptions +import requests +import json +from selenium.webdriver.common.by import By + +def crawlsleep(times): + time.sleep(times) + +def parser_beautiful(innerHTML, article, number, dircrea, bk=False): + if not innerHTML: + return article, number + if bk: + article += "**" + if isinstance(innerHTML, str): + article += innerHTML.text + return article, number + + for chi in innerHTML.children: + # article, number = parser_beautiful(chi, article, number, dircrea, bk) + tag_name = chi.name + if isinstance(chi, str): + article += chi.text + continue + else: + cll = [c for c in chi.children] + if tag_name in ['table', 'tbody', 'tr', 'td', 'u', 'em']: + article, number = parser_beautiful(chi, article, number, dircrea, bk) + elif tag_name=="br": + article += "\n" + elif tag_name=="p": + article, number = parser_beautiful(chi, article, number, dircrea, bk) + article += "\n\n" + # elif tag_name=="br": + # article += "
\n" + elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: + article += '#' * int(tag_name[-1]) + ' ' + article, number = parser_beautiful(chi, article, number, dircrea, bk) + article += '\n\n' + elif tag_name=="span": + datatex = None + classc = None + if 'data-tex' in chi.attrs.keys(): + datatex = chi.attrs["data-tex"] + if 'class' in chi.attrs.keys(): + classc = chi.attrs["class"] + if datatex and classc and 'ztext-math' in classc: + content = chi.attrs["data-tex"] + while len(content) > 0 and ' '==content[0]: + content = content[1:] + while len(content) > 0 and ' '==content[-1]: + content = content[:-1] + if len(content) > 0: + if article[-3-1:]=='
' or article[-1:]=='\n': + article += "\n$" + content + "$" + else: + article += "$" + content + "$" + else: + article, number = parser_beautiful(chi, article, number, dircrea, bk) + # article += nod.text + elif tag_name=="a": + linksite = None + if 'href' in chi.attrs.keys(): + linksite = chi.attrs['href'] + if linksite: + linksite = linksite.replace("//link.zhihu.com/?target=https%3A", "").replace("//link.zhihu.com/?target=http%3A", "") + if len(article) > 0 and article[-1]=='\n': + article += "["+chi.text+"]"+"("+linksite + ")" + else: + article += "\n\n["+chi.text+"]"+"("+linksite + ")" + elif tag_name=='b' or tag_name=='strong': + if len(cll) > 1: + article, number = parser_beautiful(chi, article, number, dircrea, True) + else: + txt = chi.text + while len(txt) > 0 and txt[-1] == " ": + txt = txt[:-1] + article += " **" + txt + "** " + elif tag_name=="figure": + noscript = chi.find_all('noscript') + if len(noscript) > 0: + chi.noscript.extract() + imgchunk = chi.find_all('img') + for i in range(len(imgchunk)): + imglink = None + if 'data-original' in imgchunk[i].attrs.keys(): + imglink = imgchunk[i].attrs["data-original"] + + if 'data-actualsrc' in imgchunk[i].attrs.keys(): + imglink = imgchunk[i].attrs['data-actualsrc'] + + if imglink==None: + imglink = imgchunk[i].attrs["src"] + try: + response = requests.get(imglink, timeout=30) + except: + try: + response = requests.get(imglink, timeout=30) + except: + continue + if response.status_code==200: + with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj: + obj.write(response.content) + + files = {'image': open(os.path.join(dircrea, str(number) + '.jpg'), 'rb')} + response1 = requests.post('https://www.testingcloud.club/sapi/api/image_upload', + files=files,timeout=30) + if response1.status_code==200: + continue + # jsons = json.loads(response1.text) + # print(jsons) + # article += ''' \n\n'''%jsons['url'] + + requests.put(imglink, timeout=30) + number += 1 + crawlsleep(1) + elif tag_name=="div": + prenode = chi.find_all('code') + if len(prenode) > 0: + for i in prenode: + article += "\n\n```\n" + i.text + "\n```\n\n" + else: + article, number = parser_beautiful(chi, article, number, dircrea, bk) + article += "\n\n" + if bk: + article += "**" + return article, number + + +# Xpanx.com 专业网络爬虫程序定制,加微信 LiteMango(付费) +# 设置webdriver路径,替换为你的webdriver路径 +abspath = os.path.abspath(__file__) + +driverpath = os.path.join(abspath, 'msedgedriver\msedgedriver.exe') +service = Service(executable_path=driverpath) +edge_options = EdgeOptions() + +#https://stackoverflow.com/questions/53039551/selenium-webdriver-modifying-navigator-webdriver-flag-to-prevent-selenium-detec +edge_options.add_experimental_option('excludeSwitches', ['enable-automation']) +edge_options.add_experimental_option('useAutomationExtension', False) +edge_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en') +edge_options.add_argument("disable-blink-features=AutomationControlled")#就是这一行告诉chrome去掉了webdriver痕迹 + +# 初始化webdriver +driver = webdriver.Edge(options=edge_options, service = service) + +# 微信公众号文章的 URL +url = 'https://mp.weixin.qq.com/s?__biz=MzI4Njg5MDA5NA==&mid=2247483973&idx=2&sn=483265ffa9087ca956ec2d637119a5f8&chksm=ebd74344dca0ca5298b894fbb706c26ee942a423e858e27679f06df4b83899e1a97cc9d5eb97&scene=21###wechat_redirect' + +# 打开页面 +driver.get(url) + +# 等待一定时间让页面加载完成 +time.sleep(5) + +# 获取页面的源代码 +html = driver.page_source + +# 解析 HTML +soup = BeautifulSoup(html, 'html.parser') + +# 提取文章标题 +title = soup.find('h1', {'class': 'rich_media_title'}).get_text(strip=True) + +# 提取文章内容 +content = soup.find('div', {'id': 'js_content'}) +richtext = driver.find_element(By.TAG_NAME, "section") +article = '' +print(content) +inner = driver.execute_script("return arguments[0].innerHTML;", richtext) +innerHTML = BeautifulSoup(inner, "html.parser") +res,num = parser_beautiful(innerHTML,article,0,'d://',False) +# 将 HTML 转换为 Markdown +#markdown = md(str(content)) + +# 将 Markdown 写入文件 +with open(f'{title}.md', 'w', encoding='utf-8') as f: + f.write(res) + +# 关闭webdriver +driver.quit() \ No newline at end of file