csdn_spider/csdn/Untitled-1.txt

21 lines
797 B
Plaintext
Raw Permalink Normal View History

2024-08-28 07:23:08 +00:00
def start3(self):
    """Queue every article listed in the hard-coded CSDN category.

    Requests the paginated listing ``category_8454447_<n>.html`` for
    n = 1, 2, 3, ... and stops at the first page that contains no
    ``div.column_article_title`` elements. For each listed article a
    ``(title, href)`` tuple is appended to ``self.TaskQueue``; the title has
    surrounding whitespace stripped and all spaces removed.

    Reads ``self.headers`` (sent with every request) and mutates
    ``self.TaskQueue``. Returns ``None``.
    """
    # NOTE: the blog host and category id are hard-coded for this crawl.
    base_url = u'https://xuesong.blog.csdn.net/category_8454447_'

    # Build the scraper once instead of once per page, so the same object
    # (and whatever session state it accumulates) serves the whole crawl.
    scraper = cloudscraper.create_scraper()

    num = 0
    while True:
        num += 1
        url = base_url + str(num) + '.html'
        print(url)

        response = scraper.get(url, headers=self.headers)
        soup = BeautifulSoup(response.text, "html.parser")
        articles = soup.find_all('div', attrs={"class": "column_article_title"})
        if not articles:
            # An empty listing marks the end of the pagination.
            break

        for article in articles:
            link = article.a
            # Skip malformed entries instead of crashing the whole crawl:
            # a title div without an <a>, or an <a> without an href.
            if link is None or link.get('href') is None:
                continue
            article_title = link.text.strip().replace(' ', '')
            article_href = link['href']
            self.TaskQueue.append((article_title, article_href))