Add a WeChat official account crawler; the special image format is still unresolved
parent ee0478010d
commit 44dcb062e0

crawler.py (14 changed lines)
@@ -399,10 +399,19 @@ def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
                     except:
                         continue
                 if response.status_code==200:
-                    article += ''' <img src="%d.jpg" width="100%%"/> \n\n'''%number
                     # article += '''<img src="%d.jpg"/>'''%number
                     with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj:
                         obj.write(response.content)
+
+                    files = {'image': open(os.path.join(dircrea, str(number) + '.jpg'), 'rb')}
+                    response1 = requests.post('https://www.testingcloud.club/sapi/api/image_upload',
+                                              files=files, timeout=30)
+                    if response1.status_code==200:
+                        jsons = json.loads(response1.text)
+                        print(jsons)
+                        article += ''' <img src="https://www.testingcloud.club/sapi/api/image_download/%s" width="100%%"/> \n\n'''%jsons['url']
+
+                    requests.put(imglink, timeout=30)
                 number += 1
                 crawlsleep(sleeptime)
             elif tag_name=="div":
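The hunk above swaps the local `%d.jpg` reference for a rehosted one: the downloaded image is uploaded to the image_upload endpoint and the token it returns is embedded via image_download. A minimal sketch of that round-trip, assuming (as the code implies) the endpoint takes a multipart field named 'image' and answers with JSON whose 'url' field is the download token:

import requests

def rehost_image(local_path):
    # Upload a local image and return its public download URL, or None on failure.
    with open(local_path, 'rb') as fobj:
        resp = requests.post('https://www.testingcloud.club/sapi/api/image_upload',
                             files={'image': fobj}, timeout=30)
    if resp.status_code != 200:
        return None
    return 'https://www.testingcloud.club/sapi/api/image_download/%s' % resp.json()['url']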
@@ -534,6 +543,7 @@ def recursion(nod, article, number, driver, dircrea, bk=False):
             # article += '''<img src="%d.jpg"/>'''%number
             with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj:
                 obj.write(response.content)
+            response = requests.post(url, files=files)
             number += 1
             crawlsleep(sleeptime)
         return article, number
@@ -632,7 +642,7 @@ def crawl_article_detail(driver:webdriver):
     # article_childNodes = driver.execute_script("return arguments[0].childNodes;", richtext)
     article = ""
     number = 0
+    article = article + '[转载自](%s)' % website + '\n\n'
     # for nod in article_childNodes:
     #     article, number = recursion(nod, article, number, driver, dircrea)
@@ -0,0 +1,186 @@ (new file: the WeChat official account crawler)

from selenium import webdriver
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import time
import os
from selenium.webdriver.edge.service import Service
from selenium.webdriver import EdgeOptions
import requests
import json
from selenium.webdriver.common.by import By


def crawlsleep(times):
    time.sleep(times)

def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
    """Recursively walk a BeautifulSoup node and append Markdown to `article`.

    `number` is the running image counter, `dircrea` the directory images are
    saved into, and `bk` wraps the emitted text in ** when inside a bold tag.
    """
    if not innerHTML:
        return article, number
    if bk:
        article += "**"
    if isinstance(innerHTML, str):
        article += innerHTML.text
        return article, number

    for chi in innerHTML.children:
        # article, number = parser_beautiful(chi, article, number, dircrea, bk)
        tag_name = chi.name
        if isinstance(chi, str):
            article += chi.text
            continue
        else:
            cll = [c for c in chi.children]
        if tag_name in ['table', 'tbody', 'tr', 'td', 'u', 'em']:
            article, number = parser_beautiful(chi, article, number, dircrea, bk)
        elif tag_name == "br":
            article += "\n"
        elif tag_name == "p":
            article, number = parser_beautiful(chi, article, number, dircrea, bk)
            article += "\n\n"
        # elif tag_name=="br":
        #     article += "<br>\n"
        elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            article += '#' * int(tag_name[-1]) + ' '
            article, number = parser_beautiful(chi, article, number, dircrea, bk)
            article += '\n\n'
        elif tag_name == "span":
            # Zhihu renders formulas as <span class="ztext-math" data-tex="...">;
            # turn them into inline $...$ math.
            datatex = None
            classc = None
            if 'data-tex' in chi.attrs.keys():
                datatex = chi.attrs["data-tex"]
            if 'class' in chi.attrs.keys():
                classc = chi.attrs["class"]
            if datatex and classc and 'ztext-math' in classc:
                content = chi.attrs["data-tex"].strip(' ')
                if len(content) > 0:
                    if article[-4:] == '<br>' or article[-1:] == '\n':
                        article += "\n$" + content + "$"
                    else:
                        article += "$" + content + "$"
            else:
                article, number = parser_beautiful(chi, article, number, dircrea, bk)
                # article += nod.text
        elif tag_name == "a":
            linksite = None
            if 'href' in chi.attrs.keys():
                linksite = chi.attrs['href']
            if linksite:
                # Unwrap Zhihu's outbound-link redirector.
                linksite = linksite.replace("//link.zhihu.com/?target=https%3A", "").replace("//link.zhihu.com/?target=http%3A", "")
                if len(article) > 0 and article[-1] == '\n':
                    article += "[" + chi.text + "](" + linksite + ")"
                else:
                    article += "\n\n[" + chi.text + "](" + linksite + ")"
        elif tag_name == 'b' or tag_name == 'strong':
            if len(cll) > 1:
                article, number = parser_beautiful(chi, article, number, dircrea, True)
            else:
                txt = chi.text.rstrip(' ')
                article += " **" + txt + "** "
        elif tag_name == "figure":
            # Images: drop the <noscript> duplicate, then fetch every <img>.
            noscript = chi.find_all('noscript')
            if len(noscript) > 0:
                chi.noscript.extract()
            imgchunk = chi.find_all('img')
            for i in range(len(imgchunk)):
                imglink = None
                if 'data-original' in imgchunk[i].attrs.keys():
                    imglink = imgchunk[i].attrs["data-original"]
                if 'data-actualsrc' in imgchunk[i].attrs.keys():
                    imglink = imgchunk[i].attrs['data-actualsrc']
                if imglink is None:
                    imglink = imgchunk[i].attrs["src"]
                try:
                    response = requests.get(imglink, timeout=30)
                except:
                    try:
                        # one retry before giving up on this image
                        response = requests.get(imglink, timeout=30)
                    except:
                        continue
                if response.status_code == 200:
                    with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj:
                        obj.write(response.content)

                    with open(os.path.join(dircrea, str(number) + '.jpg'), 'rb') as fobj:
                        response1 = requests.post('https://www.testingcloud.club/sapi/api/image_upload',
                                                  files={'image': fobj}, timeout=30)
                    if response1.status_code == 200:
                        # Embedding is disabled for now: the special WeChat image
                        # format mentioned in the commit message is not yet resolved.
                        continue
                        # jsons = json.loads(response1.text)
                        # print(jsons)
                        # article += ''' <img src="https://www.testingcloud.club/sapi/api/image_download/%s" width="100%%"/> \n\n'''%jsons['url']

                    requests.put(imglink, timeout=30)
                number += 1
                crawlsleep(1)
        elif tag_name == "div":
            # Code blocks: fence each <code> child's text; otherwise recurse.
            prenode = chi.find_all('code')
            if len(prenode) > 0:
                for i in prenode:
                    article += "\n\n```\n" + i.text + "\n```\n\n"
            else:
                article, number = parser_beautiful(chi, article, number, dircrea, bk)
                article += "\n\n"
    if bk:
        article += "**"
    return article, number

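# A usage sketch with a hypothetical fragment, to illustrate the contract:
# pass a parsed fragment, the Markdown so far, a starting image counter, and
# an image directory; get back the Markdown plus the updated counter.
#
#   fragment = BeautifulSoup('<p>Hello <b>world</b></p><h2>Title</h2>', 'html.parser')
#   md_text, img_count = parser_beautiful(fragment, '', 0, 'imgs')  # 'imgs' is a stand-in dir
#   print(repr(md_text))  # -> 'Hello  **world** \n\n## Title\n\n'
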
# Xpanx.com custom web crawler development; WeChat: LiteMango (paid)
# Set the webdriver path; replace it with your own webdriver path
abspath = os.path.dirname(os.path.abspath(__file__))

driverpath = os.path.join(abspath, 'msedgedriver', 'msedgedriver.exe')
service = Service(executable_path=driverpath)
edge_options = EdgeOptions()

# https://stackoverflow.com/questions/53039551/selenium-webdriver-modifying-navigator-webdriver-flag-to-prevent-selenium-detec
edge_options.add_experimental_option('excludeSwitches', ['enable-automation'])
edge_options.add_experimental_option('useAutomationExtension', False)
edge_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
edge_options.add_argument("disable-blink-features=AutomationControlled")  # this argument removes the webdriver fingerprint
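# To verify the masking took effect, one can query the flag once the driver
# below is up (navigator.webdriver should come back None instead of true):
#   print(driver.execute_script("return navigator.webdriver"))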

# Initialize the webdriver
driver = webdriver.Edge(options=edge_options, service=service)

# URL of the WeChat official account article
url = 'https://mp.weixin.qq.com/s?__biz=MzI4Njg5MDA5NA==&mid=2247483973&idx=2&sn=483265ffa9087ca956ec2d637119a5f8&chksm=ebd74344dca0ca5298b894fbb706c26ee942a423e858e27679f06df4b83899e1a97cc9d5eb97&scene=21###wechat_redirect'

# Open the page
driver.get(url)

# Give the page time to finish loading
time.sleep(5)

# Grab the page source
html = driver.page_source

# Parse the HTML
soup = BeautifulSoup(html, 'html.parser')

# Extract the article title
title = soup.find('h1', {'class': 'rich_media_title'}).get_text(strip=True)

# Extract the article body
content = soup.find('div', {'id': 'js_content'})
richtext = driver.find_element(By.TAG_NAME, "section")
article = ''
print(content)
inner = driver.execute_script("return arguments[0].innerHTML;", richtext)
innerHTML = BeautifulSoup(inner, "html.parser")
res, num = parser_beautiful(innerHTML, article, 0, 'd://', False)

# Convert the HTML to Markdown
# markdown = md(str(content))

# Write the Markdown to a file
with open(f'{title}.md', 'w', encoding='utf-8') as f:
    f.write(res)

# Close the webdriver
driver.quit()
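
The commit message leaves the WeChat image format unresolved: the saver above hardcodes a .jpg extension, while mp.weixin.qq.com image URLs typically carry the real format in a wx_fmt query parameter and often serve png/gif/webp. A possible direction, sketched here as an untested helper that derives the extension from the HTTP Content-Type instead:

def image_extension(response):
    # Map the Content-Type of a downloaded image to a file extension;
    # fall back to .jpg when the header is missing or unrecognized.
    known = {'image/jpeg': '.jpg', 'image/png': '.png',
             'image/gif': '.gif', 'image/webp': '.webp'}
    ctype = response.headers.get('Content-Type', '').split(';')[0].strip()
    return known.get(ctype, '.jpg')

# Inside the figure branch this would replace the hardcoded name:
#   ext = image_extension(response)
#   with open(os.path.join(dircrea, str(number) + ext), 'wb') as obj:
#       obj.write(response.content)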