add multi-user

master
ds19991999 2019-10-24 23:17:40 +08:00
parent 521ae25c1b
commit ac7908cfc0
4 changed files with 64 additions and 36 deletions

View File

@ -1,2 +1,4 @@
#!/usr/bin/env python
# coding: utf-8
# Package entry points: expose the multi-user `spider` helper and the
# `CSDN` scraper class from the csdn submodule.
from .csdn import spider
from .csdn import CSDN

View File

@ -1,15 +1,18 @@
#!/usr/bin/env python #!/usr/bin/env python
# coding: utf-8 # coding: utf-8
import os, time, re import os, time, re
import contextlib
import sys
import requests import requests
import threading import threading
from bs4 import BeautifulSoup, Comment from bs4 import BeautifulSoup, Comment
from .tomd import Tomd from .tomd import Tomd
def result_file(folder_name, file_name, article_name): def result_file(folder_username, file_name, folder_name):
folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../"+article_name, folder_name) folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", folder_name, folder_username)
if not os.path.exists(folder): if not os.path.exists(folder):
os.makedirs(folder) os.makedirs(folder)
path = os.path.join(folder, file_name) path = os.path.join(folder, file_name)
@ -77,17 +80,16 @@ class TaskQueue(object):
class CSDN(object): class CSDN(object):
def __init__(self, username, folder_name):
    """Create a scraper for the CSDN user *username*.

    Articles are written beneath *folder_name* (one sub-folder per user).
    """
    # Browser-like User-Agent so CSDN serves the normal HTML pages.
    self.headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"
    }
    self.username = username
    self.TaskQueue = TaskQueue()  # work queue of (title, url) pairs
    self.folder_name = folder_name
    self.url_num = 1  # running article counter; presumably reset after each README pass — confirm in write_readme
def start(self): def start(self):
"""获取文章标题和链接"""
num = 0 num = 0
while True: while True:
num += 1 num += 1
@ -98,14 +100,14 @@ class CSDN(object):
articles = soup.find_all('div', attrs={"class":"article-item-box csdn-tracking-statistics"}) articles = soup.find_all('div', attrs={"class":"article-item-box csdn-tracking-statistics"})
if len(articles) > 0: if len(articles) > 0:
for article in articles: for article in articles:
article_title = article.a.text.strip().replace(' ',': ') article_title = article.a.text.strip().replace(' ','')
article_href = article.a['href'] article_href = article.a['href']
self.TaskQueue.InsertUnVisitedList([article_title, article_href]) with ensure_memory(sys.getsizeof(self.TaskQueue.UnVisitedList)):
self.TaskQueue.InsertUnVisitedList([article_title, article_href])
else: else:
break break
def get_md(self, url): def get_md(self, url):
"""爬取文章"""
response = requests.get(url=url, headers=self.headers) response = requests.get(url=url, headers=self.headers)
html = response.text html = response.text
soup = BeautifulSoup(html, 'lxml') soup = BeautifulSoup(html, 'lxml')
@ -128,9 +130,10 @@ class CSDN(object):
def write_readme(self): def write_readme(self):
"""生成readme""" print("+"*100)
print("[++] 正在爬取 {} 的博文 ......".format(self.username)) print("[++] 开始爬取 {} 的博文 ......".format(self.username))
reademe_path = result_file(self.username,file_name="README.md",article_name=self.article__folder_name) print("+"*100)
reademe_path = result_file(self.username,file_name="README.md",folder_name=self.folder_name)
with open(reademe_path,'w', encoding='utf-8') as reademe_file: with open(reademe_path,'w', encoding='utf-8') as reademe_file:
readme_head = "# " + self.username + " 的博文\n" readme_head = "# " + self.username + " 的博文\n"
reademe_file.write(readme_head) reademe_file.write(readme_head)
@ -140,17 +143,16 @@ class CSDN(object):
self.url_num += 1 self.url_num += 1
self.url_num = 1 self.url_num = 1
def spider(self): def get_all_articles(self):
"""爬取所有文章"""
try: try:
while True: while True:
[article_title,article_href] = self.TaskQueue.PopUnVisitedList() [article_title,article_href] = self.TaskQueue.PopUnVisitedList()
try: try:
print("[++++] 正在处理URL{}".format(article_href))
file_name = re.sub(r'[\/:*?"<>|]','-', article_title) + ".md" file_name = re.sub(r'[\/:*?"<>|]','-', article_title) + ".md"
artical_path = result_file(folder_name=self.username, file_name=file_name, article_name=self.article__folder_name) artical_path = result_file(folder_username=self.username, file_name=file_name, folder_name=self.folder_name)
md_head = "# " + article_title + "\n" md_head = "# " + article_title + "\n"
md = md_head + self.get_md(article_href) md = md_head + self.get_md(article_href)
print("[++++] 正在处理URL{}".format(article_href))
with open(artical_path, "w", encoding="utf-8") as artical_file: with open(artical_path, "w", encoding="utf-8") as artical_file:
artical_file.write(md) artical_file.write(md)
except Exception: except Exception:
@ -165,22 +167,48 @@ class CSDN(object):
break break
thread_list = [] thread_list = []
for i in range(thread_num): for i in range(thread_num):
th = threading.Thread(target=self.spider) th = threading.Thread(target=self.get_all_articles)
thread_list.append(th) thread_list.append(th)
for th in thread_list: for th in thread_list:
th.start() th.start()
lock = threading.Lock()
# Shared budget of spare memory (bytes) that spider threads may hold at once.
total_mem = 1024 * 1024 * 500  # 500MB spare memory


@contextlib.contextmanager
def ensure_memory(size):
    """Reserve *size* bytes from the shared memory budget for the duration
    of the ``with`` block.

    Blocks (polling every 5 seconds) until more than *size* bytes are
    available, then debits the budget; the bytes are always credited back
    on exit, even if the body raises.
    """
    global total_mem
    while True:
        with lock:
            if total_mem > size:
                total_mem -= size
                break
        time.sleep(5)
    try:
        yield
    finally:
        # Original code restored the budget unconditionally after `yield`,
        # which leaked the reservation whenever the body raised.
        with lock:
            total_mem += size
def spider_user(username: str, thread_num: int = 10, folder_name: str = "articles"):
    """Scrape every blog post of *username* into *folder_name*.

    Collects the article list first, then runs the README writer and the
    download workers (``muti_spider`` with *thread_num* threads) in
    background threads.
    """
    # exist_ok avoids the exists()/makedirs() race when several
    # per-user threads create the shared output folder concurrently.
    os.makedirs(folder_name, exist_ok=True)
    csdn = CSDN(username, folder_name)
    csdn.start()  # fill the task queue with (title, url) pairs
    readme_thread = threading.Thread(target=csdn.write_readme)
    readme_thread.start()
    worker_thread = threading.Thread(target=csdn.muti_spider, args=(thread_num,))
    worker_thread.start()
def spider(usernames: list, thread_num: int = 10, folder_name: str = "articles"):
    """Scrape the blogs of every user in *usernames*, one thread per user.

    Each user is handled by ``spider_user`` in its own thread; a failure to
    start one user's thread is reported and does not stop the others.
    """
    for username in usernames:
        try:
            user_thread = threading.Thread(
                target=spider_user, args=(username, thread_num, folder_name)
            )
            user_thread.start()
            print("[++] 开启爬取 {} 博文进程成功 ......".format(username))
        except Exception:
            print("[--] 开启爬取 {} 博文进程出现异常 ......".format(username))
if __name__ == "__main__":
    # Demo run: scrape the default user's blog when executed as a script.
    spider(["ds19991999"])

View File

@ -11,12 +11,9 @@ python3 -m pip install -r requirements.txt
## 爬取用户全部博文
```python
import csdn
csdn.spider(["ds19991999", "u013088062"], 5)
# 参数 usernames: list, thread_num: int = 10, folder_name: str = "articles"
```
## LICENSE

View File

@ -1,6 +1,7 @@
#!/usr/bin/env python
# coding: utf-8
# Example driver script: scrape two users' blogs with 5 threads each.
import csdn

if __name__ == "__main__":
    csdn.spider(["ds19991999", "u013088062"], 5)