From cd65a4ec596182b20aea7c8ce2e8c2d384042855 Mon Sep 17 00:00:00 2001 From: ZouJiu <1069679911@qq.com> Date: Fri, 28 Jul 2023 20:33:12 +0800 Subject: [PATCH] paragraph crawl --- crawler.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/crawler.py b/crawler.py index fe89878..bb3e77a 100644 --- a/crawler.py +++ b/crawler.py @@ -383,10 +383,12 @@ def recursion(nod, article, number, driver, dircrea, bk=False): for pnode in p_childNodes: article, number = recursion(pnode, article, number, driver, dircrea, bk) elif tag_name=='p': - p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod) - for pnode in p_childNodes: - article, number = recursion(pnode, article, number, driver, dircrea, bk) - article += "\n" + try: + p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod) + for pnode in p_childNodes: + article, number = recursion(pnode, article, number, driver, dircrea, bk) + except: + article += nod.text elif tag_name=="div": # atags = nod.find_elements(By.TAG_NAME, 'a') prenode = nod.find_elements(By.TAG_NAME, 'code')