from selenium import webdriver
from selenium.webdriver import EdgeOptions
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import requests
import json
import time
import os


def crawlsleep(times):
    time.sleep(times)


def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
    """Walk a BeautifulSoup node tree and append its Markdown rendering to
    `article`. `number` counts downloaded images, `dircrea` is the directory
    images are saved into, and `bk=True` wraps the current subtree's output
    in `**` (bold)."""
    if not innerHTML:
        return article, number
    if bk:
        article += "**"
    if isinstance(innerHTML, str):
        # NavigableString: plain text, nothing to recurse into.
        article += innerHTML.text
        return article, number
    for chi in innerHTML.children:
        tag_name = chi.name
        if isinstance(chi, str):
            article += chi.text
            continue
        else:
            cll = [c for c in chi.children]
        if tag_name in ['table', 'tbody', 'tr', 'td', 'u', 'em']:
            # Structural tags: just recurse into their children.
            article, number = parser_beautiful(chi, article, number, dircrea, bk)
        elif tag_name == "br":
            article += "\n"
        elif tag_name == "p":
            article, number = parser_beautiful(chi, article, number, dircrea, bk)
            article += "\n\n"
\n" elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: article += '#' * int(tag_name[-1]) + ' ' article, number = parser_beautiful(chi, article, number, dircrea, bk) article += '\n\n' elif tag_name=="span": datatex = None classc = None if 'data-tex' in chi.attrs.keys(): datatex = chi.attrs["data-tex"] if 'class' in chi.attrs.keys(): classc = chi.attrs["class"] if datatex and classc and 'ztext-math' in classc: content = chi.attrs["data-tex"] while len(content) > 0 and ' '==content[0]: content = content[1:] while len(content) > 0 and ' '==content[-1]: content = content[:-1] if len(content) > 0: if article[-3-1:]=='
            else:
                article, number = parser_beautiful(chi, article, number, dircrea, bk)
        elif tag_name == "a":
            linksite = chi.attrs.get('href')
            if linksite:
                # Unwrap Zhihu's external-link redirector.
                linksite = linksite.replace("//link.zhihu.com/?target=https%3A", "") \
                                   .replace("//link.zhihu.com/?target=http%3A", "")
                if len(article) > 0 and article[-1] == '\n':
                    article += "[" + chi.text + "](" + linksite + ")"
                else:
                    article += "\n\n[" + chi.text + "](" + linksite + ")"
        elif tag_name == 'b' or tag_name == 'strong':
            if len(cll) > 1:
                # Nested markup inside the bold tag: recurse with bk=True.
                article, number = parser_beautiful(chi, article, number, dircrea, True)
            else:
                txt = chi.text.rstrip(' ')
                article += " **" + txt + "** "
        elif tag_name == "figure":
            noscript = chi.find_all('noscript')
            if len(noscript) > 0:
                chi.noscript.extract()
            imgchunk = chi.find_all('img')
            for img in imgchunk:
                imglink = img.attrs.get("data-original")
                if 'data-actualsrc' in img.attrs:
                    imglink = img.attrs['data-actualsrc']
                if imglink is None:
                    imglink = img.attrs["src"]
                imglink = imglink.replace("&tp=webp", "")
                try:
                    response = requests.get(imglink, timeout=30)
                except Exception:
                    # One retry before giving up on this image.
                    try:
                        response = requests.get(imglink, timeout=30)
                    except Exception:
                        continue
                if response.status_code == 200:
                    imgpath = os.path.join(dircrea, str(number) + '.jpg')
                    with open(imgpath, 'wb') as obj:
                        obj.write(response.content)
                    # Re-host the downloaded image and embed the returned URL.
                    with open(imgpath, 'rb') as imgf:
                        response1 = requests.post(
                            'https://www.testingcloud.club/sapi/api/image_upload',
                            files={'image': imgf}, timeout=30)
                    # Skip on failure (the original skipped on 200, which
                    # silently dropped every successfully uploaded image).
                    if response1.status_code != 200:
                        continue
                    jsons = json.loads(response1.text)
                    # Image Markdown reconstructed here; the original template
                    # string was garbled by HTML stripping.
                    article += '''![](%s)\n\n''' % jsons['url']
                    number += 1
                    crawlsleep(1)
        elif tag_name == "div":
            prenode = chi.find_all('code')
            if len(prenode) > 0:
                # Code blocks: emit fenced Markdown instead of recursing.
                for node in prenode:
                    article += "\n\n```\n" + node.text + "\n```\n\n"
            else:
                article, number = parser_beautiful(chi, article, number, dircrea, bk)
                article += "\n\n"
    if bk:
        article += "**"
    return article, number
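
# A minimal, optional self-check for parser_beautiful; it is not invoked by the
# crawl flow below. The HTML fragment is made up for illustration and only
# confirms that headings, paragraphs and bold runs come out as Markdown.
def demo_parser_beautiful(outdir='.'):
    sample = BeautifulSoup(
        "<div><h2>Title</h2><p>Hello <strong>world</strong></p></div>",
        "html.parser")
    text, _ = parser_beautiful(sample, '', 0, outdir, False)
    print(text)  # roughly: "## Title\n\nHello **world** \n\n"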
# Xpanx.com -- professional custom web-crawler development; add WeChat LiteMango (paid).

# Set the webdriver path; replace it with your own msedgedriver location.
abspath = os.path.abspath(__file__)
driverpath = os.path.join(os.path.dirname(abspath), 'msedgedriver', 'msedgedriver.exe')
service = Service(executable_path=driverpath)

edge_options = EdgeOptions()
# https://stackoverflow.com/questions/53039551/selenium-webdriver-modifying-navigator-webdriver-flag-to-prevent-selenium-detec
edge_options.add_experimental_option('excludeSwitches', ['enable-automation'])
edge_options.add_experimental_option('useAutomationExtension', False)
edge_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
# This flag removes the webdriver fingerprint from the browser.
edge_options.add_argument("disable-blink-features=AutomationControlled")

# Initialize the webdriver.
driver = webdriver.Edge(options=edge_options, service=service)

# URL of the WeChat Official Account article.
url = 'https://mp.weixin.qq.com/s?__biz=MzI4Njg5MDA5NA==&mid=2247483973&idx=2&sn=483265ffa9087ca956ec2d637119a5f8&chksm=ebd74344dca0ca5298b894fbb706c26ee942a423e858e27679f06df4b83899e1a97cc9d5eb97&scene=21###wechat_redirect'

# Open the page and give it time to finish loading.
driver.get(url)
time.sleep(5)

# Parse the rendered page source.
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

# Extract the article title and content container.
title = soup.find('h1', {'class': 'rich_media_title'}).get_text(strip=True)
content = soup.find('div', {'id': 'js_content'})
print(content)

# Render the live <section> element's innerHTML to Markdown;
# downloaded images are saved under d:// as 0.jpg, 1.jpg, ...
richtext = driver.find_element(By.TAG_NAME, "section")
inner = driver.execute_script("return arguments[0].innerHTML;", richtext)
innerHTML = BeautifulSoup(inner, "html.parser")
article = ''
res, num = parser_beautiful(innerHTML, article, 0, 'd://', False)

# Alternative: convert the whole content div to Markdown with markdownify.
# markdown = md(str(content))

# Write the Markdown to a file named after the article title.
with open(f'{title}.md', 'w', encoding='utf-8') as f:
    f.write(res)

# Close the webdriver.
driver.quit()
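
# Note: the markdownify one-liner commented out above is a rough fallback for
# parser_beautiful. md() is markdownify's entry point, but unlike the parser it
# will not download/re-host images or convert data-tex formulas. A sketch (the
# '-fallback' filename is illustrative, not from the original script):
#
#   fallback = md(str(content))
#   with open(f'{title}-fallback.md', 'w', encoding='utf-8') as f:
#       f.write(fallback)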