master
zcy 2024-07-03 17:41:44 +08:00
parent ca37ac0626
commit b64f7eb271
5 changed files with 134 additions and 8 deletions

View File

@@ -1,4 +1,6 @@
#!/usr/bin/env python
# coding: utf-8
from .csdn import spider
from .csdn import onearticle
from .csdn import CSDN

View File

@@ -1,13 +1,22 @@
#!/usr/bin/env python
# coding: utf-8
from random import randint
import os, re
import requests
from bs4 import BeautifulSoup, Comment
from .tomd import Tomd
import json
import cloudscraper
import time

def replace_chinese(text, old_chinese, new_chinese):
    # Case-insensitive literal replacement of old_chinese with new_chinese
    # (re.escape makes the needle literal rather than a regex)
    pattern = re.compile(re.escape(old_chinese), re.IGNORECASE)
    return pattern.sub(new_chinese, text)
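# Usage note (hypothetical strings): despite its name, replace_chinese does a
# case-insensitive literal replacement of any substring, e.g.:
#     print(replace_chinese("Foo bar FOO", "foo", "baz"))  # -> "baz bar baz"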
def result_file(folder_username, file_name, folder_name):
    folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", folder_name, folder_username)
    if not os.path.exists(folder):
@@ -69,7 +78,10 @@ class CSDN(object):
        while len(articles) > 0:
            num += 1
            url = u'https://blog.csdn.net/' + self.username + '/article/list/' + str(num)
            scraper = cloudscraper.create_scraper()  # returns a CloudScraper instance
            response = scraper.get(url)
            # response = self.s.get(url=url, headers=self.headers)
            html = response.text
            soup = BeautifulSoup(html, "html.parser")
            articles = soup.find_all('div', attrs={"class":"article-item-box csdn-tracking-statistics"})
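# Why cloudscraper: CSDN fronts its pages with Cloudflare, and a plain
# requests session gets stopped by the JS challenge; that is presumably what
# the switch above works around. A minimal standalone sketch of the pattern
# (list-page URL built the same way as above):
import cloudscraper

scraper = cloudscraper.create_scraper()  # drop-in for requests.Session
response = scraper.get("https://blog.csdn.net/Poulen/article/list/1")
print(response.status_code, len(response.text))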
@@ -79,10 +91,16 @@ class CSDN(object):
                self.TaskQueue.append((article_title, article_href))

    def get_md(self, url):
        scraper = cloudscraper.create_scraper()  # returns a CloudScraper instance
        # response = self.s.get(url=url, headers=self.headers)
        html = scraper.get(url).text
        soup = BeautifulSoup(html, 'lxml')
        content = soup.select_one("#mainBox > main > div.blog-content-box")
        if content is None:
            return ""
        print(str(content))
        # Remove HTML comments
        for useless_tag in content(text=lambda text: isinstance(text, Comment)):
            useless_tag.extract()
@@ -96,6 +114,7 @@ class CSDN(object):
        eles_except = ["img", "br", "hr"]
        delete_blank_ele(content, eles_except)
        # Convert to Markdown
        print(str(content))
        md = Tomd(str(content)).markdown
        return md
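# Taken together, get_md is fetch -> parse -> prune -> convert. A hedged
# standalone sketch of that flow (article URL borrowed from single_article.py
# below; the CSS selector matches CSDN's layout at the time of this commit):
import cloudscraper
from bs4 import BeautifulSoup

html = cloudscraper.create_scraper().get(
    "https://blog.csdn.net/u010658002/article/details/124567286").text
content = BeautifulSoup(html, "lxml").select_one("#mainBox > main > div.blog-content-box")
if content is None:
    print("selector missed: layout changed or the request was challenged")
else:
    print(str(content)[:200])  # the HTML fragment that would be handed to Tomd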
@@ -107,24 +126,54 @@ class CSDN(object):
        with open(reademe_path,'w', encoding='utf-8') as reademe_file:
            readme_head = "# " + self.username + "'s articles\n"
            reademe_file.write(readme_head)
            self.TaskQueue.reverse()
            for (article_title,article_href) in self.TaskQueue:
                text = str(self.url_num) + '. [' + article_title + '](' + article_href + ')\n'
                reademe_file.write(text)
                self.url_num += 1
        self.url_num = 1

    def get_all_articles(self):
        while len(self.TaskQueue) > 0:
            (article_title,article_href) = self.TaskQueue.pop()
            time.sleep(randint(10, 25))
            file_name = re.sub(r'[\/:*?"<>|\n]','-', article_title) + ".md"
            artical_path = result_file(folder_username=self.username, file_name=file_name, folder_name=self.folder_name)
            article_title = article_title.replace('\n',' ')
            article_title = article_title.replace('"',' ')
            article_title = article_title.replace('\'',' ')
            article_title = article_title.replace('\r',' ')
            article_title = article_title.replace('\t',' ')
            md_head = "# " + article_title + "\n"
            md = self.get_md(article_href)
            print(md)
            while md == "":
                time.sleep(randint(5, 25))
                md = self.get_md(article_href)
            md = '[Source](www.csdn.net)\r\n ' + md_head + md
            print("[++++] Processing URL {}".format(article_href))
            # https://www.testingcloud.club/sapi/api/article_tree
            with open(artical_path, "w", encoding="utf-8") as artical_file:
                artical_file.write(md)
            requests.put("https://www.testingcloud.club/sapi/api/article_tree", json.dumps({
                "title": article_title,
                "content": md,
                "spend_time": 1,
                "father": 2500,
                "level": 1,
                "is_public": 1,
                "author": "admin"
            }))
            self.url_num += 1
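# Note on the upload: passing json.dumps(...) as the positional data argument
# sends the body without a Content-Type header. A sketch of the same PUT with
# requests' json= parameter, which serializes and sets the header in one step
# (field meanings such as father and level are inferred from the code above,
# not from any API documentation):
import requests

payload = {
    "title": "example title",      # article title
    "content": "# markdown body",  # converted markdown
    "spend_time": 1,
    "father": 2500,                # assumed: parent node id in the article tree
    "level": 1,
    "is_public": 1,
    "author": "admin",
}
resp = requests.put("https://www.testingcloud.club/sapi/api/article_tree", json=payload)
print(resp.status_code)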
@@ -138,3 +187,31 @@ def spider(username: str, cookie_path:str, folder_name: str = "blog"):
    csdn.get_all_articles()
def onearticle(href: str, cookie: str, folder_name: str = "blog"):
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    csdn = CSDN('username', folder_name, cookie)
    md = csdn.get_md(href)
    print(md)
    while md == "":
        time.sleep(randint(5, 25))
        md = csdn.get_md(href)
    print("[++++] Processing URL {}".format(href))
    # https://www.testingcloud.club/sapi/api/article_tree
    # with open(artical_path, "w", encoding="utf-8") as artical_file:
    #     artical_file.write(md)
    # requests.put("https://www.testingcloud.club/sapi/api/article_tree", json.dumps({
    #     "title": article_title,
    #     "content": md,
    #     "spend_time": 1,
    #     "father": 2500,
    #     "level": 1,
    #     "is_public": 1,
    #     "author": "admin"
    # }))
    # self.url_num += 1

View File

@@ -1,4 +1,8 @@
# coding: utf-8
import re
import requests
import json
__all__ = ['Tomd', 'convert']
@@ -68,7 +72,8 @@ INLINE_ELEMENTS = {
    'img': '<img.*?src="(.*?)".*?>(.*?)</img>',
    'a': '<a.*?href="(.*?)".*?>(.*?)</a>',
    'em': '<em.*?>(.*?)</em>',
    'strong': '<strong.*?>(.*?)</strong>',
    'math': '<span><span><span>(.*?)</span></span></span>'
}

DELETE_ELEMENTS = ['<span.*?>', '</span>', '<div.*?>', '</div>']
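# The new 'math' pattern assumes CSDN renders inline formulas as three nested
# spans; a quick check of what it captures (the markup shape is an assumption):
import re

sample = '<span><span><span>x^2 + y^2</span></span></span>'
m = re.search('<span><span><span>(.*?)</span></span></span>', sample)
print(m.group(1))  # -> x^2 + y^2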
@@ -109,6 +114,8 @@ class Element:
            self.content = re.sub(pattern, '|\g<1>', self.content.replace('\n', ''))
        elif self.tag == 'tr' and tag == 'td':
            self.content = re.sub(pattern, '|\g<1>', self.content.replace('\n', ''))
        elif self.tag == 'math':
            # assumed fix: wrapper appears unbound in this branch in the committed hunk
            wrapper = MARKDOWN.get(tag)
            self.content = re.sub(pattern, '{}\g<1>{}'.format(wrapper[0], wrapper[1]), self.content)
        else:
            wrapper = MARKDOWN.get(tag)
            self.content = re.sub(pattern, '{}\g<1>{}'.format(wrapper[0], wrapper[1]), self.content)
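# For context: the wrapper substitution assumes MARKDOWN maps a tag name to a
# (prefix, suffix) pair. A minimal sketch of the mechanism with an assumed
# ('**', '**') entry for 'strong':
import re

wrapper = ('**', '**')
content = 'now <strong>bold</strong> text'
print(re.sub('<strong.*?>(.*?)</strong>',
             '{}\\g<1>{}'.format(wrapper[0], wrapper[1]), content))
# -> now **bold** text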
@@ -139,6 +146,35 @@ class Tomd:
                elements.append(element)
        elements.sort(key=lambda element: element.start_pos)
        for e in elements:
            if e.tag == 'p_with_out_class':
                # print(e.content)
                imgs = re.findall(r"<img[\s\S]*?>", str(e), re.S)
                if imgs:
                    print("found 1", len(imgs))
                    print(imgs)
                    reg = """<img[^>]+src=["']([^'"<>]+)["'][^<>]+/?>"""
                    imgs2 = re.findall(reg, str(e))
                    print("found2", len(imgs2))
                    i = 0
                    # https://www.testingcloud.club/sapi/api/download_pic
                    for img in imgs2:
                        print(img)
                        resp = requests.post("https://www.testingcloud.club/sapi/api/download_pic", json.dumps({
                            "url": img,
                        }))
                        obj = json.loads(resp.text)
                        # print("http://127.0.0.1:4596/api/image_download/" + obj['url'])
                        # https://www.testingcloud.club/sapi/api/image_download/
                        e.content = str(e).replace(imgs[i], "![](https://www.testingcloud.club/sapi/api/image_download/" + obj['url'] + ")")
                        i = i + 1
                        # e.content = str(e).replace(img, "\r\n https://www.testingcloud.club/sapi/api/image_download/" + obj['url'])
        self._markdown = ''.join([str(e) for e in elements])
        for index, element in enumerate(DELETE_ELEMENTS):
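# Caveat on the image loop above: imgs and imgs2 come from two different
# regexes, so imgs[i] can drift or raise IndexError when their matches
# disagree. A hedged alternative that rewrites every <img> in one pass with a
# re.sub callback, against the same download_pic endpoint and 'url' response
# key used above:
import re, json, requests

IMG_SRC = re.compile(r"""<img[^>]+src=["']([^'"<>]+)["'][^<>]*/?>""")

def rehost(match):
    # Ask the mirror endpoint to fetch the original image, then link the mirror.
    resp = requests.post("https://www.testingcloud.club/sapi/api/download_pic",
                         json={"url": match.group(1)})
    obj = json.loads(resp.text)
    return "![](https://www.testingcloud.club/sapi/api/image_download/" + obj["url"] + ")"

fragment = '<p><img src="https://example.com/pic.png" /></p>'  # hypothetical input
print(IMG_SRC.sub(rehost, fragment))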

single_article.py Normal file
View File

@@ -0,0 +1,9 @@
#!/usr/bin/env python
# coding: utf-8
import csdn
if __name__ == "__main__":
    csdn.onearticle("https://blog.csdn.net/u010658002/article/details/124567286", "cookie.txt")

View File

@@ -4,4 +4,6 @@
import csdn

if __name__ == "__main__":
    csdn.spider("Poulen", "cookie.txt")