modify parse htmlpage

parent 26c892386e
commit 37f7180e1b

@@ -4,6 +4,8 @@
## Highlights

1. Save **answers** to **pdf** and **markdown**, together with the corresponding images, code blocks, and websites; the layout largely follows the web page, **math formulas are preserved in markdown**, and each saved answer includes both the question and the answer itself<br>

+###### 20230729 Pages are now parsed with the BeautifulSoup library, which is more stable in use; previously selenium was used for this

2. Save **articles** to **pdf** and **markdown**, together with the corresponding images, code blocks, and websites; the layout largely follows the web page, **math formulas are preserved in markdown**<br>

3. Save **thoughts** to text together with the corresponding images, and finally merge all the text files into a single archive<br>
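For orientation, here is a minimal, self-contained sketch of the BeautifulSoup-based conversion that the 20230729 note refers to. It is a simplified stand-in for the `parser_beautiful` function added to `crawler.py` in this commit: only a few tags are handled, image downloading is skipped, and the sample HTML is made up.

```python
# Simplified sketch of the BeautifulSoup -> markdown idea used by parser_beautiful.
# The tag coverage and the sample HTML below are illustrative only.
from bs4 import BeautifulSoup, NavigableString

def html_to_markdown(node, article=""):
    # Walk the children of a parsed node and emit rough markdown.
    for chi in node.children:
        if isinstance(chi, NavigableString):            # plain text between tags
            article += str(chi)
        elif chi.name == "p":                           # paragraph -> blank-line separated text
            article = html_to_markdown(chi, article) + "\n\n"
        elif chi.name in ("b", "strong"):               # bold -> **text**
            article += " **" + chi.get_text().strip() + "** "
        elif chi.name == "span" and "ztext-math" in (chi.get("class") or []):
            article += "$" + chi.get("data-tex", "") + "$"   # formula span -> inline LaTeX
        else:                                           # anything else: recurse into it
            article = html_to_markdown(chi, article)
    return article

if __name__ == "__main__":
    html = '<p>Euler: <span class="ztext-math" data-tex="e^{i\\pi}+1=0">e…</span> is <b>famous</b>.</p>'
    soup = BeautifulSoup(html, "html.parser")
    print(html_to_markdown(soup))
```

The real function in the diff below additionally handles headings, links, tables, code blocks, and image downloads, and threads an image counter and output directory through the recursion.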
crawler.py
@@ -26,6 +26,7 @@ from selenium.webdriver.common.print_page_options import PrintOptions
from selenium.webdriver.support import expected_conditions as EC
import base64
from zipfile import ZipFile
+from bs4 import BeautifulSoup

abspath = os.path.abspath(__file__)
filename = abspath.split(os.sep)[-1]
@@ -306,6 +307,107 @@ def cleartxt(kkk):
    kkk = kkk.replace("\n", "")
    return kkk

+def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
+    if not innerHTML:
+        return article, number
+    if bk:
+        article += "**"
+    if isinstance(innerHTML, str):
+        article += innerHTML.text
+        return article, number
+
+    for chi in innerHTML.children:
+        # article, number = parser_beautiful(chi, article, number, dircrea, bk)
+        tag_name = chi.name
+        if isinstance(chi, str):
+            article += chi.text
+            continue
+        else:
+            cll = [c for c in chi.children]
+        if tag_name in ['table', 'tbody', 'tr', 'td', 'u', 'em']:
+            article, number = parser_beautiful(chi, article, number, dircrea, bk)
+        elif tag_name=="br":
+            article += "\n"
+        elif tag_name=="p":
+            article, number = parser_beautiful(chi, article, number, dircrea, bk)
+            article += "\n\n"
+        # elif tag_name=="br":
+        #     article += "<br>\n"
+        elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+            article += '#' * int(tag_name[-1]) + ' '
+            article, number = parser_beautiful(chi, article, number, dircrea, bk)
+            article += '\n\n'
+        elif tag_name=="span":
+            datatex = None
+            classc = None
+            if 'data-tex' in chi.attrs.keys():
+                datatex = chi.attrs["data-tex"]
+            if 'class' in chi.attrs.keys():
+                classc = chi.attrs["class"]
+            if datatex and classc and 'ztext-math' in classc:
+                if article[-3-1:]=='<br>' or article[-1:]=='\n':
+                    article += "\n$" + chi.attrs["data-tex"] + "$"
+                else:
+                    article += "$" + chi.attrs["data-tex"] + "$"
+            else:
+                article, number = parser_beautiful(chi, article, number, dircrea, bk)
+                # article += nod.text
+        elif tag_name=="a":
+            linksite = None
+            if 'href' in chi.attrs.keys():
+                linksite = chi.attrs['href']
+            if linksite:
+                linksite = linksite.replace("//link.zhihu.com/?target=https%3A", "").replace("//link.zhihu.com/?target=http%3A", "")
+                if article[-1]=='\n':
+                    article += "["+chi.text+"]"+"("+linksite + ")"
+                else:
+                    article += "\n\n["+chi.text+"]"+"("+linksite + ")"
+        elif tag_name=='b' or tag_name=='strong':
+            if len(cll) > 1:
+                article, number = parser_beautiful(chi, article, number, dircrea, True)
+            else:
+                txt = chi.text
+                while len(txt) > 0 and txt[-1] == " ":
+                    txt = txt[:-1]
+                article += " **" + txt + "** "
+        elif tag_name=="figure":
+            noscript = chi.find_all('noscript')
+            if len(noscript) > 0:
+                chi.noscript.extract()
+            imgchunk = chi.find_all('img')
+            for i in range(len(imgchunk)):
+                imglink = None
+                if 'data-original' in imgchunk[i].attrs.keys():
+                    imglink = imgchunk[i].attrs["data-original"]
+                if 'data-actualsrc' in imgchunk[i].attrs.keys():
+                    imglink = imgchunk[i].attrs['data-actualsrc']
+                if imglink==None:
+                    imglink = imgchunk[i].attrs["src"]
+                try:
+                    response = requests.get(imglink, timeout=30)
+                except:
+                    try:
+                        response = requests.get(imglink, timeout=30)
+                    except:
+                        continue
+                if response.status_code==200:
+                    article += ''' <img src="%d.jpg" width="100%%"/> \n\n'''%number
+                    # article += '''<img src="%d.jpg"/>'''%number
+                    with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj:
+                        obj.write(response.content)
+                    number += 1
+                crawlsleep(sleeptime)
+        elif tag_name=="div":
+            prenode = chi.find_all('code')
+            if len(prenode) > 0:
+                for i in prenode:
+                    article += "\n\n```\n" + i.text + "\n```\n\n"
+    if bk:
+        article += "**"
+    return article, number
+
def recursion(nod, article, number, driver, dircrea, bk=False):
    if isinstance(nod, dict):
        if 'nodeName' in nod.keys() and nod['nodeName']=='#text':
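The `figure`/`img` branch above is the part that localizes images: it prefers the `data-actualsrc` / `data-original` attributes over `src`, downloads the file with `requests`, saves it as a numbered `.jpg` in the output directory, and emits a local `<img>` reference into the markdown. Below is a compact sketch of just that idea; the function name, the example URL, and the output directory are placeholders, not code from this commit, and the retry/sleep logic of the real branch is omitted.

```python
# Sketch of the <figure>/<img> handling in parser_beautiful: pick the best image URL,
# download it, save it as "<number>.jpg", and point the markdown at the local copy.
import os
import requests

def save_figure_images(figure_tag, article, number, dircrea):
    # figure_tag: a parsed <figure> element; dircrea: directory for downloaded images.
    for img in figure_tag.find_all("img"):
        # The full-size URL usually sits in data-actualsrc / data-original rather than src.
        imglink = img.get("data-actualsrc") or img.get("data-original") or img.get("src")
        if not imglink:
            continue
        try:
            response = requests.get(imglink, timeout=30)
        except requests.RequestException:
            continue
        if response.status_code == 200:
            with open(os.path.join(dircrea, f"{number}.jpg"), "wb") as obj:
                obj.write(response.content)
            article += f' <img src="{number}.jpg" width="100%"/> \n\n'
            number += 1
    return article, number

# Hypothetical usage on a parsed fragment:
# from bs4 import BeautifulSoup
# soup = BeautifulSoup('<figure><img data-original="https://example.com/a.jpg"/></figure>', "html.parser")
# article, number = save_figure_images(soup.figure, "", 0, "./out")
```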
@@ -324,9 +426,12 @@ def recursion(nod, article, number, driver, dircrea, bk=False):
            article += "<br>\n"
        elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            article += "\n" + '#' * int(tag_name[-1]) + ' '
+            try:
                p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
                for pnode in p_childNodes:
                    article, number = recursion(pnode, article, number, driver, dircrea, bk)
+            except:
+                pass
            article += '\n'
        elif tag_name=="span":
            datatex = nod.get_attribute("data-tex")
@@ -391,6 +496,7 @@ def recursion(nod, article, number, driver, dircrea, bk=False):
                    article, number = recursion(pnode, article, number, driver, dircrea, bk)
            except:
                article += nod.text
+            article += "\n"
        elif tag_name=="div":
            # atags = nod.find_elements(By.TAG_NAME, 'a')
            prenode = nod.find_elements(By.TAG_NAME, 'code')
@@ -511,13 +617,19 @@ def crawl_article_detail(driver:webdriver):
        except:
            pass

+    if MarkDown_FORMAT:
        richtext = driver.find_element(By.CLASS_NAME, "Post-RichText")
        titletext = driver.find_element(By.CLASS_NAME, "Post-Title")
-        article_childNodes = driver.execute_script("return arguments[0].childNodes;", richtext)
+        # article_childNodes = driver.execute_script("return arguments[0].childNodes;", richtext)
        article = ""
        number = 0
-        for nod in article_childNodes:
-            article, number = recursion(nod, article, number, driver, dircrea)
+        # for nod in article_childNodes:
+        #     article, number = recursion(nod, article, number, driver, dircrea)
+
+        inner = driver.execute_script("return arguments[0].innerHTML;", richtext)
+        innerHTML = BeautifulSoup(inner, "html.parser")
+        article, number = parser_beautiful(innerHTML, article, number, dircrea)
+
        article = article.replace("修改\n", "").replace("开启赞赏\n", "开启赞赏, ").replace("添加评论\n", "").replace("分享\n", "").\
            replace("收藏\n", "").replace("设置\n", "")
@@ -694,11 +806,16 @@ def crawl_answer_detail(driver:webdriver):
                crawlsleep(max(2, sleeptime))
                button.click()
            question_RichText = QuestionRichText.find_element(By.CLASS_NAME, "RichText")
-            question_childNodes = driver.execute_script("return arguments[0].childNodes;", question_RichText)
+            # question_childNodes = driver.execute_script("return arguments[0].childNodes;", question_RichText)

            article += "# question: <br>\n"
-            for nod in question_childNodes:
-                article, number = recursion(nod, article, number, driver, dircrea)
+            # for nod in question_childNodes:
+            #     article, number = recursion(nod, article, number, driver, dircrea)
+
+            inner = driver.execute_script("return arguments[0].innerHTML;", question_RichText)
+            innerHTML = BeautifulSoup(inner, "html.parser")
+            article, number = parser_beautiful(innerHTML, article, number, dircrea)
        except:
            pass
        article += "<br>\n\n\n# answer: <br>\n"
@@ -734,6 +851,8 @@ def crawl_answer_detail(driver:webdriver):
        richtext = QuestionAnswer.find_element(By.CLASS_NAME, "CopyrightRichText-richText")
        Createdtime = QuestionAnswer.find_element(By.CLASS_NAME, "ContentItem-time")
        Created = Createdtime.text[4:].replace(" ", "_").replace(":", "_").replace(".", "_")

+        if MarkDown_FORMAT:
        metatext = QuestionAnswer.find_elements(By.TAG_NAME, "meta")
        for i in range(len(metatext)):
            # if metatext[i].get_attribute("itemprop")=="dateCreated":
@@ -741,9 +860,13 @@ def crawl_answer_detail(driver:webdriver):
            if metatext[i].get_attribute("itemprop")=="dateModified":
                Modified = metatext[i].get_attribute("content").replace(" ", "_").replace(":", "_").replace(".", "_")

-        answer_childNodes = driver.execute_script("return arguments[0].childNodes;", richtext)
-        for nod in answer_childNodes:
-            article, number = recursion(nod, article, number, driver, dircrea)
+        # answer_childNodes = driver.execute_script("return arguments[0].childNodes;", richtext)
+        # for nod in answer_childNodes:
+        #     article, number = recursion(nod, article, number, driver, dircrea)
+
+        inner = driver.execute_script("return arguments[0].innerHTML;", richtext)
+        innerHTML = BeautifulSoup(inner, "html.parser")
+        article, number = parser_beautiful(innerHTML, article, number, dircrea)
+
        article = article.replace("修改\n", "").replace("开启赞赏\n", "开启赞赏, ").replace("添加评论\n", "").replace("分享\n", "").\
            replace("收藏\n", "").replace("设置\n", "")
@@ -903,6 +1026,7 @@ if __name__ == "__main__":
                        断了再次爬取的话,可以配置到--links_scratch,事先保存好website')
    parser.add_argument('--article', action="store_true", help=r'crawl article, 是否爬取知乎的文章, 保存到pdf、markdown以及相关图片等,已经爬取过的不会重复爬取,\
                        断了再次爬取的话,可以配置到--links_scratch,事先保存好website')
+    parser.add_argument('--MarkDown', action="store_true", help=r'save MarkDown')
    parser.add_argument('--links_scratch', action="store_true", \
                        help=r'crawl links scratch for answer or article, 是否使用已经保存好的website和title, 否则再次爬取website')
    args = parser.parse_args()
@@ -912,25 +1036,26 @@ if __name__ == "__main__":
    crawl_article = args.article
    crawl_links_scratch = args.links_scratch
    addtime = args.computer_time_sleep
+    MarkDown_FORMAT = args.MarkDown

    # crawl_think = False
    # crawl_article = False
    # crawl_answer = True
    # crawl_links_scratch = False
-    # python.exe c:/Users/10696/Desktop/access/zhihu/crawler.py --think
-    # python.exe c:/Users/10696/Desktop/access/zhihu/crawler.py --article
-    # python.exe c:/Users/10696/Desktop/access/zhihu/crawler.py --answer
+    # MarkDown_FORMAT = True
+    # python.exe c:/Users/10696/Desktop/access/zhihu/crawler.py --think --MarkDown
+    # python.exe c:/Users/10696/Desktop/access/zhihu/crawler.py --article --MarkDown
+    # python.exe c:/Users/10696/Desktop/access/zhihu/crawler.py --answer --MarkDown
    # python.exe c:/Users/10696/Desktop/access/zhihu/crawler.py --think --answer --article
    zhihu()
-    try:
-        crawl_links_scratch = False
-        zhihu()
-    except:
-        time.sleep(600)
-        try:
-            zhihu()
-        except:
-            time.sleep(600)
-            zhihu()
+    # try:
+    #     crawl_links_scratch = False
+    #     zhihu()
+    # except:
+    #     time.sleep(600)
+    #     try:
+    #         zhihu()
+    #     except:
+    #         time.sleep(600)
+    #         zhihu()
    logfp.close()