htmlparse

master
ZouJiu 2023-07-23 13:32:37 +08:00
parent 032bf59d82
commit 599f5c2f0a
1 changed file with 25 additions and 10 deletions

View File

@ -306,12 +306,16 @@ def cleartxt(kkk):
kkk = kkk.replace("\n", "")
return kkk
def recursion(nod, article, number, driver, dircrea):
def recursion(nod, article, number, driver, dircrea, bk=False):
if isinstance(nod, dict):
if 'nodeName' in nod.keys() and nod['nodeName']=='#text':
kkk = cleartxt(nod['textContent'])
if len(kkk) > 0:
if bk:
article += "**"
article += nod['textContent']
if bk:
article += "**"
return article, number
elif isinstance(nod, webdriver.remote.webelement.WebElement):
@ -322,7 +326,7 @@ def recursion(nod, article, number, driver, dircrea):
article += "\n" + '#' * int(tag_name[-1]) + ' '
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
for pnode in p_childNodes:
article, number = recursion(pnode, article, number, driver, dircrea)
article, number = recursion(pnode, article, number, driver, dircrea, bk)
article += '\n'
elif tag_name=="span":
datatex = nod.get_attribute("data-tex")
@ -336,11 +340,15 @@ def recursion(nod, article, number, driver, dircrea):
imgchunk = nod.find_elements(By.TAG_NAME, 'img')
achunk = nod.find_elements(By.TAG_NAME, 'a')
if len(imgchunk)==0 and len(achunk)==0:
if bk:
article += "**"
article += nod.text
if bk:
article += "**"
else:
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
for pnode in p_childNodes:
article, number = recursion(pnode, article, number, driver, dircrea)
article, number = recursion(pnode, article, number, driver, dircrea, bk)
# else:
# formula_span = nod.find_elements(By.CLASS_NAME, "ztext-math")
# for jf in range(len(formula_span)):
@ -355,22 +363,29 @@ def recursion(nod, article, number, driver, dircrea):
else:
article += "["+nod.text+"]"+"("+linksite + ")"
elif tag_name=="b" or tag_name=="strong":
txt = nod.text
while len(txt) > 0 and txt[-1] == " ":
txt = txt[:-1]
article += " **" + txt + "** "
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
for pnode in p_childNodes:
article, number = recursion(pnode, article, number, driver, dircrea, True)
# txt = nod.text
# while len(txt) > 0 and txt[-1] == " ":
# txt = txt[:-1]
# article += " **" + txt + "** "
elif tag_name=="em":
if bk:
article += "**"
article += nod.text
if bk:
article += "**"
# elif tag_name=='td':
# article += nod.text
elif tag_name in ['table', 'tbody', 'tr', 'td', 'u']:
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
for pnode in p_childNodes:
article, number = recursion(pnode, article, number, driver, dircrea)
article, number = recursion(pnode, article, number, driver, dircrea, bk)
elif tag_name=='p':
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
for pnode in p_childNodes:
article, number = recursion(pnode, article, number, driver, dircrea)
article, number = recursion(pnode, article, number, driver, dircrea, bk)
article += "\n"
elif tag_name=="div":
# atags = nod.find_elements(By.TAG_NAME, 'a')
@ -381,7 +396,7 @@ def recursion(nod, article, number, driver, dircrea):
else:
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
for pnode in p_childNodes:
article, number = recursion(pnode, article, number, driver, dircrea)
article, number = recursion(pnode, article, number, driver, dircrea, bk)
elif tag_name=="figure":
imgchunk = nod.find_elements(By.TAG_NAME, 'img')
for i in range(len(imgchunk)):