Add a WeChat Official Account crawler; handling of its unusual image format is still unresolved
parent
ee0478010d
commit
44dcb062e0
crawler.py
@@ -399,10 +399,19 @@ def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
                    except:
                        continue
                if response.status_code==200:
                    article += ''' <img src="%d.jpg" width="100%%"/> \n\n'''%number
                    # article += '''<img src="%d.jpg"/>'''%number
                    with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj:
                        obj.write(response.content)

                    files = {'image': open(os.path.join(dircrea, str(number) + '.jpg'), 'rb')}
                    response1 = requests.post('https://www.testingcloud.club/sapi/api/image_upload',
                                              files=files, timeout=30)
                    if response1.status_code==200:
                        jsons = json.loads(response1.text)
                        print(jsons)
                        article += ''' <img src="https://www.testingcloud.club/sapi/api/image_download/%s" width="100%%"/> \n\n'''%jsons['url']

                    requests.put(imglink, timeout=30)
                    number += 1
                    crawlsleep(sleeptime)
        elif tag_name=="div":
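The hunk above saves every downloaded image with a hardcoded .jpg extension, which is exactly where the unresolved image-format issue from the commit message bites: many image hosts return WebP or PNG bytes regardless of what the URL suggests. A minimal sketch of deriving the extension from the response's Content-Type header instead; pick_extension is a hypothetical helper, not part of this commit:

def pick_extension(response):
    # Normalise the Content-Type header (e.g. 'image/webp; charset=...')
    # and map it to a file extension; fall back to .jpg when unrecognised.
    ctype = response.headers.get('Content-Type', '').split(';')[0].strip().lower()
    return {'image/png': '.png', 'image/gif': '.gif',
            'image/webp': '.webp', 'image/jpeg': '.jpg'}.get(ctype, '.jpg')

With it, str(number) + '.jpg' above would become str(number) + pick_extension(response).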
@@ -534,6 +543,7 @@ def recursion(nod, article, number, driver, dircrea, bk=False):
            # article += '''<img src="%d.jpg"/>'''%number
            with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj:
                obj.write(response.content)
            response = requests.post(url, files=files)
            number += 1
            crawlsleep(sleeptime)
    return article, number
@@ -632,7 +642,7 @@ def crawl_article_detail(driver:webdriver):
    # article_childNodes = driver.execute_script("return arguments[0].childNodes;", richtext)
    article = ""
    number = 0

    article = article + '[转载自](%s)' % website + '\n\n'
    # for nod in article_childNodes:
    #     article, number = recursion(nod, article, number, driver, dircrea)
@@ -0,0 +1,186 @@
from selenium import webdriver
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import time
import os
from selenium.webdriver.edge.service import Service
from selenium.webdriver import EdgeOptions
import requests
import json
from selenium.webdriver.common.by import By

def crawlsleep(times):
    time.sleep(times)
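# crawlsleep above sleeps for a fixed interval; a jittered variant (our
# suggestion, not part of this commit) is gentler on rate limiters:
# import random
# def crawlsleep(times):
#     time.sleep(times + random.uniform(0, 0.5 * times))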
def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
    if not innerHTML:
        return article, number
    if bk:
        article += "**"
    if isinstance(innerHTML, str):
        article += innerHTML.text
        return article, number

    for chi in innerHTML.children:
        # article, number = parser_beautiful(chi, article, number, dircrea, bk)
        tag_name = chi.name
        if isinstance(chi, str):
            article += chi.text
            continue
        else:
            cll = [c for c in chi.children]
        if tag_name in ['table', 'tbody', 'tr', 'td', 'u', 'em']:
            article, number = parser_beautiful(chi, article, number, dircrea, bk)
        elif tag_name=="br":
            article += "\n"
        elif tag_name=="p":
            article, number = parser_beautiful(chi, article, number, dircrea, bk)
            article += "\n\n"
        # elif tag_name=="br":
        #     article += "<br>\n"
        elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            article += '#' * int(tag_name[-1]) + ' '
            article, number = parser_beautiful(chi, article, number, dircrea, bk)
            article += '\n\n'
        elif tag_name=="span":
            # Formula spans: Zhihu-style math keeps the TeX source in data-tex
            datatex = None
            classc = None
            if 'data-tex' in chi.attrs.keys():
                datatex = chi.attrs["data-tex"]
            if 'class' in chi.attrs.keys():
                classc = chi.attrs["class"]
            if datatex and classc and 'ztext-math' in classc:
                content = chi.attrs["data-tex"]
                while len(content) > 0 and ' '==content[0]:
                    content = content[1:]
                while len(content) > 0 and ' '==content[-1]:
                    content = content[:-1]
                if len(content) > 0:
                    if article[-4:]=='<br>' or article[-1:]=='\n':
                        article += "\n$" + content + "$"
                    else:
                        article += "$" + content + "$"
            else:
                article, number = parser_beautiful(chi, article, number, dircrea, bk)
                # article += nod.text
        elif tag_name=="a":
            linksite = None
            if 'href' in chi.attrs.keys():
                linksite = chi.attrs['href']
            if linksite:
                linksite = linksite.replace("//link.zhihu.com/?target=https%3A", "").replace("//link.zhihu.com/?target=http%3A", "")
                if len(article) > 0 and article[-1]=='\n':
                    article += "["+chi.text+"]"+"("+linksite+")"
                else:
                    article += "\n\n["+chi.text+"]"+"("+linksite+")"
        elif tag_name=='b' or tag_name=='strong':
            if len(cll) > 1:
                article, number = parser_beautiful(chi, article, number, dircrea, True)
            else:
                txt = chi.text
                while len(txt) > 0 and txt[-1] == " ":
                    txt = txt[:-1]
                article += " **" + txt + "** "
        elif tag_name=="figure":
            # Images: drop the <noscript> fallback, then fetch every <img>
            noscript = chi.find_all('noscript')
            if len(noscript) > 0:
                chi.noscript.extract()
            imgchunk = chi.find_all('img')
            for i in range(len(imgchunk)):
                # Lazy-loaded images keep the real link in data-original /
                # data-actualsrc; fall back to src
                imglink = None
                if 'data-original' in imgchunk[i].attrs.keys():
                    imglink = imgchunk[i].attrs["data-original"]

                if 'data-actualsrc' in imgchunk[i].attrs.keys():
                    imglink = imgchunk[i].attrs['data-actualsrc']

                if imglink==None:
                    imglink = imgchunk[i].attrs["src"]
                try:
                    response = requests.get(imglink, timeout=30)
                except:
                    # one retry, then give up on this image
                    try:
                        response = requests.get(imglink, timeout=30)
                    except:
                        continue
                if response.status_code==200:
                    with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj:
                        obj.write(response.content)

                    files = {'image': open(os.path.join(dircrea, str(number) + '.jpg'), 'rb')}
                    response1 = requests.post('https://www.testingcloud.club/sapi/api/image_upload',
                                              files=files, timeout=30)
                    if response1.status_code==200:
                        # Upload succeeded, but the Markdown embed stays disabled:
                        # the unresolved image-format issue from the commit message
                        continue
                        # jsons = json.loads(response1.text)
                        # print(jsons)
                        # article += ''' <img src="https://www.testingcloud.club/sapi/api/image_download/%s" width="100%%"/> \n\n'''%jsons['url']

                    requests.put(imglink, timeout=30)
                    number += 1
                    crawlsleep(1)
        elif tag_name=="div":
            # Code blocks live in <div><code>...</code></div>
            prenode = chi.find_all('code')
            if len(prenode) > 0:
                for i in prenode:
                    article += "\n\n```\n" + i.text + "\n```\n\n"
            else:
                article, number = parser_beautiful(chi, article, number, dircrea, bk)
                article += "\n\n"
    if bk:
        article += "**"
    return article, number
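# Illustrative check of parser_beautiful (ours, not part of the commit):
# parsing a small fragment shows the Markdown it emits.
#     sample = BeautifulSoup('<p>Hello <b>world</b></p><h2>Title</h2>', 'html.parser')
#     text, n = parser_beautiful(sample, '', 0, '.', False)
#     print(text)   # "Hello  **world** \n\n## Title\n\n"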
# Xpanx.com professional web-crawler development; WeChat: LiteMango (paid)
# Set the webdriver path; replace it with your own webdriver path
abspath = os.path.abspath(__file__)

# msedgedriver.exe is expected in an msedgedriver/ folder next to this script
driverpath = os.path.join(os.path.dirname(abspath), 'msedgedriver', 'msedgedriver.exe')
service = Service(executable_path=driverpath)
edge_options = EdgeOptions()

# https://stackoverflow.com/questions/53039551/selenium-webdriver-modifying-navigator-webdriver-flag-to-prevent-selenium-detec
edge_options.add_experimental_option('excludeSwitches', ['enable-automation'])
edge_options.add_experimental_option('useAutomationExtension', False)
edge_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
edge_options.add_argument("disable-blink-features=AutomationControlled")  # this flag strips the webdriver traces from the browser
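# The StackOverflow thread above also suggests clearing navigator.webdriver via
# CDP once the driver exists (our sketch, not in the commit; Edge is Chromium-based,
# so execute_cdp_cmd is available):
# driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
#     'source': "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
# })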
# Initialize the webdriver
driver = webdriver.Edge(options=edge_options, service=service)

# URL of the WeChat Official Account article
url = 'https://mp.weixin.qq.com/s?__biz=MzI4Njg5MDA5NA==&mid=2247483973&idx=2&sn=483265ffa9087ca956ec2d637119a5f8&chksm=ebd74344dca0ca5298b894fbb706c26ee942a423e858e27679f06df4b83899e1a97cc9d5eb97&scene=21###wechat_redirect'

# Open the page
driver.get(url)

# Give the page time to finish loading
time.sleep(5)

# Grab the page source
html = driver.page_source

# Parse the HTML
soup = BeautifulSoup(html, 'html.parser')

# Extract the article title
title = soup.find('h1', {'class': 'rich_media_title'}).get_text(strip=True)

# Extract the article body: WeChat wraps it in <section> tags inside #js_content
content = soup.find('div', {'id': 'js_content'})
richtext = driver.find_element(By.TAG_NAME, "section")
article = ''
print(content)
inner = driver.execute_script("return arguments[0].innerHTML;", richtext)
innerHTML = BeautifulSoup(inner, "html.parser")
res, num = parser_beautiful(innerHTML, article, 0, 'd://', False)  # images land in d:/
# Convert the HTML to Markdown
# markdown = md(str(content))

# Write the Markdown to a file
with open(f'{title}.md', 'w', encoding='utf-8') as f:
    f.write(res)

# Close the webdriver
driver.quit()
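A plausible direction for the unresolved image format: WeChat serves article images from mmbiz.qpic.cn and usually encodes the real format in the wx_fmt query parameter (the bytes are frequently WebP or PNG even though this script always writes .jpg). A hedged sketch, assuming that parameter is present; wechat_extension is our name, not part of the commit:

from urllib.parse import urlparse, parse_qs

def wechat_extension(imglink):
    # e.g. https://mmbiz.qpic.cn/mmbiz_png/xxx?wx_fmt=png -> '.png'
    fmt = parse_qs(urlparse(imglink).query).get('wx_fmt', ['jpeg'])[0]
    return {'jpeg': '.jpg', 'png': '.png', 'gif': '.gif'}.get(fmt, '.' + fmt)

Saving as str(number) + wechat_extension(imglink) instead of str(number) + '.jpg' would at least keep the on-disk extension consistent with the payload.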