update csdn.py
parent
ac7908cfc0
commit
9fdfb1c64d
20
CSDN/csdn.py
20
CSDN/csdn.py
|
@ -91,21 +91,19 @@ class CSDN(object):
|
||||||
|
|
||||||
def start(self):
|
def start(self):
|
||||||
num = 0
|
num = 0
|
||||||
while True:
|
articles = [None]
|
||||||
|
while len(articles) > 0:
|
||||||
num += 1
|
num += 1
|
||||||
url = u'https://blog.csdn.net/' + self.username + '/article/list/' + str(num)
|
url = u'https://blog.csdn.net/' + self.username + '/article/list/' + str(num)
|
||||||
response = requests.get(url=url, headers=self.headers)
|
response = requests.get(url=url, headers=self.headers)
|
||||||
html = response.text
|
html = response.text
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
articles = soup.find_all('div', attrs={"class":"article-item-box csdn-tracking-statistics"})
|
articles = soup.find_all('div', attrs={"class":"article-item-box csdn-tracking-statistics"})
|
||||||
if len(articles) > 0:
|
for article in articles:
|
||||||
for article in articles:
|
article_title = article.a.text.strip().replace(' ',':')
|
||||||
article_title = article.a.text.strip().replace(' ',':')
|
article_href = article.a['href']
|
||||||
article_href = article.a['href']
|
with ensure_memory(sys.getsizeof(self.TaskQueue.UnVisitedList)):
|
||||||
with ensure_memory(sys.getsizeof(self.TaskQueue.UnVisitedList)):
|
self.TaskQueue.InsertUnVisitedList([article_title, article_href])
|
||||||
self.TaskQueue.InsertUnVisitedList([article_title, article_href])
|
|
||||||
else:
|
|
||||||
break
|
|
||||||
|
|
||||||
def get_md(self, url):
|
def get_md(self, url):
|
||||||
response = requests.get(url=url, headers=self.headers)
|
response = requests.get(url=url, headers=self.headers)
|
||||||
|
@ -162,9 +160,7 @@ class CSDN(object):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def muti_spider(self, thread_num):
|
def muti_spider(self, thread_num):
|
||||||
while True:
|
while self.TaskQueue.getUnVisitedListLength() > 0:
|
||||||
if self.TaskQueue.getUnVisitedListLength() < 1:
|
|
||||||
break
|
|
||||||
thread_list = []
|
thread_list = []
|
||||||
for i in range(thread_num):
|
for i in range(thread_num):
|
||||||
th = threading.Thread(target=self.get_all_articles)
|
th = threading.Thread(target=self.get_all_articles)
|
||||||
|
|
Loading…
Reference in New Issue