diff --git a/csdn/__init__.py b/csdn/__init__.py index 3308286..c77ffb6 100644 --- a/csdn/__init__.py +++ b/csdn/__init__.py @@ -1,4 +1,6 @@ #!/usr/bin/env python # coding: utf-8 from .csdn import spider +from .csdn import onearticle + from .csdn import CSDN \ No newline at end of file diff --git a/csdn/csdn.py b/csdn/csdn.py index a82ab34..1c01022 100644 --- a/csdn/csdn.py +++ b/csdn/csdn.py @@ -1,13 +1,22 @@ #!/usr/bin/env python # coding: utf-8 - +from random import randint import os, re import requests from bs4 import BeautifulSoup, Comment from .tomd import Tomd +import requests +import json +import re +import cloudscraper +import time - + +def replace_chinese(text, old_chinese, new_chinese): + # 使用正则表达式匹配中文字符 + pattern = re.compile(re.escape(old_chinese), re.IGNORECASE) + return pattern.sub(new_chinese, text) def result_file(folder_username, file_name, folder_name): folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", folder_name, folder_username) if not os.path.exists(folder): @@ -69,7 +78,10 @@ class CSDN(object): while len(articles) > 0: num += 1 url = u'https://blog.csdn.net/' + self.username + '/article/list/' + str(num) - response = self.s.get(url=url, headers=self.headers) + scraper = cloudscraper.create_scraper() # returns a CloudScraper instance + response = scraper.get(url) + + # response = self.s.get(url=url, headers=self.headers) html = response.text soup = BeautifulSoup(html, "html.parser") articles = soup.find_all('div', attrs={"class":"article-item-box csdn-tracking-statistics"}) @@ -79,10 +91,16 @@ class CSDN(object): self.TaskQueue.append((article_title, article_href)) def get_md(self, url): - response = self.s.get(url=url, headers=self.headers) - html = response.text + scraper = cloudscraper.create_scraper() # returns a CloudScraper instance + + # response = self.s.get(url=url, headers=self.headers) + html = scraper.get(url).text + soup = BeautifulSoup(html, 'lxml') content = soup.select_one("#mainBox > main > 
div.blog-content-box") + if(content == None): + return "" + print(str(content)) # 删除注释 for useless_tag in content(text=lambda text: isinstance(text, Comment)): useless_tag.extract() @@ -96,6 +114,7 @@ class CSDN(object): eles_except = ["img", "br", "hr"] delete_blank_ele(content, eles_except) # 转换为markdown + print(str(content)) md = Tomd(str(content)).markdown return md @@ -107,24 +126,54 @@ class CSDN(object): with open(reademe_path,'w', encoding='utf-8') as reademe_file: readme_head = "# " + self.username + " 的博文\n" reademe_file.write(readme_head) + self.TaskQueue.reverse() for (article_title,article_href) in self.TaskQueue: text = str(self.url_num) + '. [' + article_title + ']('+ article_href +')\n' reademe_file.write(text) + self.url_num += 1 self.url_num = 1 + + def get_all_articles(self): while len(self.TaskQueue) > 0: (article_title,article_href) = self.TaskQueue.pop() + time.sleep(randint(10, 25)) + file_name = re.sub(r'[\/::*?"<>|\n]','-', article_title) + ".md" artical_path = result_file(folder_username=self.username, file_name=file_name, folder_name=self.folder_name) + article_title = article_title.replace('\n',' ') + article_title = article_title.replace('"',' ') + article_title = article_title.replace('\'',' ') + article_title = article_title.replace('\r',' ') + article_title = article_title.replace('\t',' ') md_head = "# " + article_title + "\n" - md = md_head + self.get_md(article_href) + + md = self.get_md(article_href) + print(md) + while md == "": + time.sleep(randint(5, 25)) + md = self.get_md(article_href) + + md = '[引用自](www.csdn.net)\r\n ' + md_head + md print("[++++] 正在处理URL:{}".format(article_href)) + # https://www.testingcloud.club/sapi/api/article_tree with open(artical_path, "w", encoding="utf-8") as artical_file: artical_file.write(md) + requests.put("https://www.testingcloud.club/sapi/api/article_tree",json.dumps({ + "title": (article_title), + "content":(md), + "spend_time":1, + "father":2500, + "level":1, + "author":"sds", + 
"is_public":1, + "author":"admin" + })) + md = md self.url_num += 1 @@ -138,3 +187,31 @@ def spider(username: str, cookie_path:str, folder_name: str = "blog"): csdn.get_all_articles() +def onearticle(href: str,cookie:str,folder_name: str = "blog"): + if not os.path.exists(folder_name): + os.makedirs(folder_name) + csdn = CSDN('username', folder_name, cookie) + md = csdn.get_md(href) + print(md) + while md == "": + time.sleep(randint(5, 25)) + md = csdn.get_md(href) + + print("[++++] 正在处理URL:{}".format(href)) + # https://www.testingcloud.club/sapi/api/article_tree + # with open(artical_path, "w", encoding="utf-8") as artical_file: + # artical_file.write(md) + # requests.put("https://www.testingcloud.club/sapi/api/article_tree",json.dumps({ + # "title": (article_title), + # "content":(md), + # "spend_time":1, + # "father":2500, + # "level":1, + # "author":"sds", + # "is_public":1, + # "author":"admin" + # })) + # md = md + # self.url_num += 1 + + diff --git a/csdn/tomd.py b/csdn/tomd.py index db4f893..37ffc7b 100644 --- a/csdn/tomd.py +++ b/csdn/tomd.py @@ -1,4 +1,8 @@ +# coding: utf-8 + import re +import requests +import json __all__ = ['Tomd', 'convert'] @@ -68,7 +72,8 @@ INLINE_ELEMENTS = { 'img': '(.*?)', 'a': '(.*?)', 'em': '(.*?)', - 'strong': '(.*?)' + 'strong': '(.*?)', + 'math':'(.*?)' } DELETE_ELEMENTS = ['', '', '', ''] @@ -109,6 +114,8 @@ class Element: self.content = re.sub(pattern, '|\g<1>', self.content.replace('\n', '')) elif self.tag == 'tr' and tag == 'td': self.content = re.sub(pattern, '|\g<1>', self.content.replace('\n', '')) + elif self.tag == 'math': + self.content = re.sub(pattern, '{}\g<1>{}'.format(wrapper[0], wrapper[1]), self.content) else: wrapper = MARKDOWN.get(tag) self.content = re.sub(pattern, '{}\g<1>{}'.format(wrapper[0], wrapper[1]), self.content) @@ -139,6 +146,35 @@ class Tomd: elements.append(element) elements.sort(key=lambda element: element.start_pos) + for e in elements: + if e.tag == 'p_with_out_class': + # print(e.content) 
+ imgs = re.findall(r"",str(e),re.S) + if imgs: + print("found 1",len(imgs)) + print(imgs) + reg = """]+src=["']([^'"<>]+)["'][^<>]+/?>""" + imgs2 = re.findall(reg,str(e)) + print("found2",len(imgs2)) + i = 0 + # https://www.testingcloud.club/sapi/api/download_pic + for img in imgs2: + print(img) + resp = requests.post("https://www.testingcloud.club/sapi/api/download_pic",json.dumps({ + "url": (img), + })) + obj = json.loads(resp.text) + # print("http://127.0.0.1:4596/api/image_download/" + obj['url']) + # https://www.testingcloud.club/sapi/api/image_download/ + + e.content = str(e).replace(imgs[i],"![](https://www.testingcloud.club/sapi/api/image_download/" + obj['url'] + ")") + i = i + 1 + + # e.content = str(e).replace(img,"\r\n https://www.testingcloud.club/sapi/api/image_download/" + obj['url']) + + + + self._markdown = ''.join([str(e) for e in elements]) for index, element in enumerate(DELETE_ELEMENTS): diff --git a/single_article.py b/single_article.py new file mode 100644 index 0000000..606858e --- /dev/null +++ b/single_article.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python +# coding: utf-8 + +import csdn + +if __name__ == "__main__": + csdn.onearticle("https://blog.csdn.net/u010658002/article/details/124567286", "cookie.txt") + + \ No newline at end of file diff --git a/test.py b/user.py similarity index 61% rename from test.py rename to user.py index c5166b0..1d5dc20 100644 --- a/test.py +++ b/user.py @@ -4,4 +4,6 @@ import csdn if __name__ == "__main__": - csdn.spider("ds19991999", "cookie.txt") + csdn.spider("Poulen", "cookie.txt") + + \ No newline at end of file