update csdn.py

master
ds19991999 2019-10-24 23:52:15 +08:00
parent ac7908cfc0
commit 9fdfb1c64d
1 changed file with 8 additions and 12 deletions

View File

@@ -91,21 +91,19 @@ class CSDN(object):
def start(self): def start(self):
num = 0 num = 0
while True: articles = [None]
while len(articles) > 0:
num += 1 num += 1
url = u'https://blog.csdn.net/' + self.username + '/article/list/' + str(num) url = u'https://blog.csdn.net/' + self.username + '/article/list/' + str(num)
response = requests.get(url=url, headers=self.headers) response = requests.get(url=url, headers=self.headers)
html = response.text html = response.text
soup = BeautifulSoup(html, "html.parser") soup = BeautifulSoup(html, "html.parser")
articles = soup.find_all('div', attrs={"class":"article-item-box csdn-tracking-statistics"}) articles = soup.find_all('div', attrs={"class":"article-item-box csdn-tracking-statistics"})
if len(articles) > 0:
for article in articles: for article in articles:
article_title = article.a.text.strip().replace(' ','') article_title = article.a.text.strip().replace(' ','')
article_href = article.a['href'] article_href = article.a['href']
with ensure_memory(sys.getsizeof(self.TaskQueue.UnVisitedList)): with ensure_memory(sys.getsizeof(self.TaskQueue.UnVisitedList)):
self.TaskQueue.InsertUnVisitedList([article_title, article_href]) self.TaskQueue.InsertUnVisitedList([article_title, article_href])
else:
break
def get_md(self, url): def get_md(self, url):
response = requests.get(url=url, headers=self.headers) response = requests.get(url=url, headers=self.headers)
@@ -162,9 +160,7 @@ class CSDN(object):
pass pass
def muti_spider(self, thread_num): def muti_spider(self, thread_num):
while True: while self.TaskQueue.getUnVisitedListLength() > 0:
if self.TaskQueue.getUnVisitedListLength() < 1:
break
thread_list = [] thread_list = []
for i in range(thread_num): for i in range(thread_num):
th = threading.Thread(target=self.get_all_articles) th = threading.Thread(target=self.get_all_articles)