# csdn_spider/csdn/tomd.py

# coding: utf-8

import re
import requests
import json

__all__ = ['Tomd', 'convert']

# Markdown wrappers per tag: each value is the (prefix, suffix) pair that is
# placed around the converted content of that tag.
MARKDOWN = {
    # h1..h6: one '#' per heading level, each heading on a line of its own.
    **{
        'h{}'.format(level): ('\n{} '.format('#' * level), '\n')
        for level in range(1, 7)
    },
    # Inline code, lists and quotes.
    'code': ('`', '`'),
    'ul': ('', ''),
    'ol': ('', ''),
    'li': ('- ', ''),
    'blockquote': ('\n> ', '\n'),
    # NOTE(review): <em> is rendered with the same '**' marker as <strong>.
    'em': ('**', '**'),
    'strong': ('**', '**'),
    'block_code': ('\n```\n', '\n```\n'),
    'span': ('', ''),
    # Paragraphs: block-level ones get surrounding newlines, inline ones
    # are unwrapped.
    'p': ('\n', '\n'),
    'p_with_out_class': ('\n', '\n'),
    'inline_p': ('', ''),
    'inline_p_with_out_class': ('', ''),
    # Remaining inline styling.
    'b': ('**', '**'),
    'i': ('*', '*'),
    'del': ('~~', '~~'),
    'hr': ('\n---', '\n\n'),
    # Table parts.
    'thead': ('\n', '|------\n'),
    'tbody': ('\n', '\n'),
    'td': ('|', ''),
    'th': ('|', ''),
    'tr': ('', '\n'),
}

# Regexes for block-level HTML fragments, keyed by the tag name used to look
# up the wrapper in MARKDOWN. Tomd.convert scans the document with each
# pattern in turn; the captured groups become the element's content.
# Raw strings keep regex escapes such as ``\s`` from being read as (invalid)
# Python string escapes. The name's lowercase "l" is kept because other code
# in this module refers to it by this exact spelling.
BlOCK_ELEMENTS = {
    'h1': r'<h1.*?>(.*?)</h1>',
    'h2': r'<h2.*?>(.*?)</h2>',
    'h3': r'<h3.*?>(.*?)</h3>',
    'h4': r'<h4.*?>(.*?)</h4>',
    'h5': r'<h5.*?>(.*?)</h5>',
    'h6': r'<h6.*?>(.*?)</h6>',
    # No capture group: an <hr/> element has empty content.
    'hr': r'<hr/>',
    'blockquote': r'<blockquote.*?>(.*?)</blockquote>',
    'ul': r'<ul.*?>(.*?)</ul>',
    'ol': r'<ol.*?>(.*?)</ol>',
    'block_code': r'<pre.*?><code.*?>(.*?)</code></pre>',
    # <p> with attributes; the required whitespace keeps this from matching
    # other tags that merely start with "p" (e.g. <pre>).
    'p': r'<p\s.*?>(.*?)</p>',
    # Bare <p> without attributes.
    'p_with_out_class': r'<p>(.*?)</p>',
    'thead': r'<thead.*?>(.*?)</thead>',
    'tr': r'<tr>(.*?)</tr>',
}

# Regexes for inline HTML fragments. Element.parse_inline applies them to a
# block's content in this insertion order, so the order of the keys is part
# of the behaviour and must not be rearranged.
# Raw strings keep regex escapes such as ``\s`` from being read as (invalid)
# Python string escapes.
INLINE_ELEMENTS = {
    'td': r'<td>(.*?)</td>',
    'tr': r'<tr>(.*?)</tr>',
    'th': r'<th>(.*?)</th>',
    'b': r'<b>(.*?)</b>',
    'i': r'<i>(.*?)</i>',
    'del': r'<del>(.*?)</del>',
    # <p> with attributes vs. bare <p>.
    'inline_p': r'<p\s.*?>(.*?)</p>',
    'inline_p_with_out_class': r'<p>(.*?)</p>',
    'code': r'<code.*?>(.*?)</code>',
    'span': r'<span.*?>(.*?)</span>',
    'ul': r'<ul.*?>(.*?)</ul>',
    'ol': r'<ol.*?>(.*?)</ol>',
    'li': r'<li.*?>(.*?)</li>',
    # Two groups: (1) the href value, (2) the link text.
    'a': r'<a.*?href="(.*?)".*?>(.*?)</a>',
    'em': r'<em.*?>(.*?)</em>',
    'strong': r'<strong.*?>(.*?)</strong>',
    # NOTE(review): MARKDOWN has no 'math' wrapper, and 'span' is applied
    # before this entry, so this pattern currently has no visible effect.
    'math': r'<span><span><span>(.*?)</span></span></span>',
    # Only self-closed tags with a double-quoted src are matched here.
    'img': r'<img.*?src="(.*?)".*?/>',
}

# Regexes for leftover opening/closing <span> and <div> tags; Tomd.convert
# substitutes every match with the empty string in the final Markdown text.
DELETE_ELEMENTS = [
    '<span.*?>',
    '</span>',
    '<div.*?>',
    '</div>',
]


class Element:
    """A matched HTML fragment together with its Markdown rendering.

    Attributes:
        start_pos, end_pos: offsets of the match in the source HTML.
        content: the fragment's inner text; for block elements the inline
            HTML inside it is converted to Markdown on construction.
        tag: key used to look up the (prefix, suffix) wrapper in MARKDOWN.
        is_block: whether inline conversion is run on construction.
    """

    def __init__(self, start_pos, end_pos, content, tag, is_block=False):
        """Store the match and, for block elements, convert inline HTML."""
        self.start_pos = start_pos
        self.end_pos = end_pos
        self.content = content
        self._elements = []
        self.is_block = is_block
        self.tag = tag
        self._result = None

        if self.is_block:
            self.parse_inline()

    def __str__(self):
        """Return ``content`` wrapped in the Markdown markers for ``tag``.

        A tag without an entry in MARKDOWN is rendered without any wrapper
        instead of failing on the missing lookup.
        """
        prefix, suffix = MARKDOWN.get(self.tag) or ('', '')
        self._result = '{}{}{}'.format(prefix, self.content, suffix)
        return self._result

    def parse_inline(self):
        """Rewrite inline HTML in ``content`` as Markdown, in place.

        Every pattern of INLINE_ELEMENTS is applied in insertion order. Links
        and images have fixed templates, list items and table cells depend on
        the enclosing block tag, and every other inline tag uses its wrapper
        from MARKDOWN (tags without a wrapper are left untouched).
        """
        for tag, pattern in INLINE_ELEMENTS.items():
            context = (self.tag, tag)
            if tag == 'a':
                # Group 1 is the href, group 2 the link text.
                template = r'[\g<2>](\g<1>)'
            elif tag == 'img':
                template = r'![](\g<1>)'
            elif context == ('ul', 'li'):
                template = r'- \g<1>'
            elif context == ('ol', 'li'):
                template = r'1. \g<1>'
            elif context == ('thead', 'tr'):
                # Table rows must sit on one line: drop the source newlines,
                # then end each row with exactly one newline.
                self.content = self.content.replace('\n', '')
                template = '\\g<1>\n'
            elif context in (('tr', 'th'), ('tr', 'td')):
                self.content = self.content.replace('\n', '')
                template = r'|\g<1>'
            else:
                # Generic case; it also covers 'math', which has no wrapper
                # in MARKDOWN and is therefore skipped.
                wrapper = MARKDOWN.get(tag)
                if wrapper is None:
                    continue
                template = r'{}\g<1>{}'.format(wrapper[0], wrapper[1])
            self.content = re.sub(pattern, template, self.content)


class Tomd:
    """Convert HTML to Markdown and re-host the images it references.

    Block-level fragments are located with BlOCK_ELEMENTS and rendered
    through Element. Image URLs are posted to ``_UPLOAD_ENDPOINT`` and
    rewritten to ``_DOWNLOAD_PREFIX`` plus the ``url`` value of the JSON
    reply, so converting a document with images performs network requests.
    """

    # Endpoint that receives ``{"url": <image url>}`` as a JSON body.
    _UPLOAD_ENDPOINT = "https://www.testingcloud.club/sapi/api/download_pic"
    # Prefix the reply's ``url`` value is appended to.
    _DOWNLOAD_PREFIX = "https://www.testingcloud.club/sapi/api/image_download/"
    # Seconds to wait for the endpoint; without a timeout requests can block
    # forever on an unresponsive server.
    _REQUEST_TIMEOUT = 30

    # A raw <img ...> tag that inline conversion left in place.
    _IMG_TAG = re.compile(r"<img[\s\S]*?>")
    # The src attribute inside one such tag (single or double quoted).
    _IMG_SRC = re.compile(r"""\ssrc\s*=\s*["']([^'"<>]+)["']""")
    # A Markdown image: group 1 is ``![alt](``, group 2 the URL, group 3 ``)``.
    _MD_IMAGE = re.compile(r"(!\[.*?\]\()(.*?)(\))")

    def __init__(self, html='', options=None):
        """Remember the HTML to convert; ``options`` is stored but unused."""
        self.html = html
        self.options = options
        self._markdown = ''

    def _mirror_image(self, url, cache):
        """Return the re-hosted URL for ``url``.

        ``cache`` maps already-posted URLs to their result so every distinct
        image is posted once per conversion.

        Raises:
            requests.RequestException: the request failed or timed out.
            ValueError, KeyError: the reply is not JSON or lacks ``url``.
        """
        if url not in cache:
            resp = requests.post(self._UPLOAD_ENDPOINT,
                                 data=json.dumps({"url": url}),
                                 timeout=self._REQUEST_TIMEOUT)
            cache[url] = self._DOWNLOAD_PREFIX + json.loads(resp.text)['url']
        return cache[url]

    def _img_tag_to_markdown(self, tag_html, cache):
        """Turn one raw ``<img>`` tag into a Markdown image of its re-hosted
        source; a tag without a usable ``src`` is returned unchanged."""
        src = self._IMG_SRC.search(tag_html)
        if src is None:
            return tag_html
        return '![]({})'.format(self._mirror_image(src.group(1), cache))

    def convert(self, html, options=None):
        """Convert ``html`` to Markdown, store it on the instance, return it.

        Only the outermost block-level matches are kept: a match strictly
        inside another one is dropped, because its text is already part of
        the enclosing element's content. ``options`` is accepted for
        interface compatibility and not used.
        """
        elements = []
        for tag, pattern in BlOCK_ELEMENTS.items():
            for m in re.finditer(pattern, html, re.I | re.S | re.M):
                start, end = m.start(), m.end()
                nested = any(e.start_pos < start and e.end_pos > end
                             for e in elements)
                # Rebuild the list instead of removing entries while
                # iterating it, which silently skips the following entry.
                elements = [e for e in elements
                            if not (e.start_pos > start and e.end_pos < end)]
                if not nested:
                    elements.append(Element(start_pos=start,
                                            end_pos=end,
                                            content=''.join(m.groups()),
                                            tag=tag,
                                            is_block=True))

        elements.sort(key=lambda element: element.start_pos)

        cache = {}
        for e in elements:
            if '<img' in e.content:
                # Work on the bare content: going through str(e) would bake
                # the Markdown wrapper into it and apply it a second time
                # when the element is rendered below.
                e.content = self._IMG_TAG.sub(
                    lambda m: self._img_tag_to_markdown(m.group(0), cache),
                    e.content)

        self._markdown = ''.join([str(e) for e in elements])

        for pattern in DELETE_ELEMENTS:
            self._markdown = re.sub(pattern, '', self._markdown)
        return self._markdown

    @property
    def markdown(self):
        """Markdown for ``self.html`` with every image URL re-hosted.

        Each access runs the full conversion again, including the network
        requests. Only the URL inside ``![...](...)`` image syntax is
        rewritten; empty URLs and URLs that already point at the download
        prefix are left as they are.
        """
        self.convert(self.html, self.options)
        cache = {}

        def rehost(match):
            url = match.group(2)
            if not url or url.startswith(self._DOWNLOAD_PREFIX):
                return match.group(0)
            return match.group(1) + self._mirror_image(url, cache) + match.group(3)

        self._markdown = self._MD_IMAGE.sub(rehost, self._markdown)
        return self._markdown


# Module-level convenience API: one shared Tomd instance (created with the
# default empty HTML) whose bound ``convert`` method is exposed as the
# module-level ``convert`` listed in ``__all__``.
_inst = Tomd()
convert = _inst.convert
empty 2024-07-03 09:41:44 +00:00			`# coding: utf-8`

. 2019-11-08 03:27:23 +00:00			`import re`
empty 2024-07-03 09:41:44 +00:00			`import requests`
			`import json`
. 2019-11-08 03:27:23 +00:00
			`__all__ = ['Tomd', 'convert']`

			`MARKDOWN = {`
			`'h1': ('\n# ', '\n'),`
			`'h2': ('\n## ', '\n'),`
			`'h3': ('\n### ', '\n'),`
			`'h4': ('\n#### ', '\n'),`
			`'h5': ('\n##### ', '\n'),`
			`'h6': ('\n###### ', '\n'),`
			'code': ('`', '`'),
			`'ul': ('', ''),`
			`'ol': ('', ''),`
			`'li': ('- ', ''),`
			`'blockquote': ('\n> ', '\n'),`
			`'em': ('', ''),`
			`'strong': ('', ''),`
			'block_code': ('\n```\n', '\n```\n'),
			`'span': ('', ''),`
			`'p': ('\n', '\n'),`
			`'p_with_out_class': ('\n', '\n'),`
			`'inline_p': ('', ''),`
			`'inline_p_with_out_class': ('', ''),`
			`'b': ('', ''),`
			`'i': ('', ''),`
			`'del': ('~~', '~~'),`
			`'hr': ('\n---', '\n\n'),`
			`'thead': ('\n', '\|------\n'),`
			`'tbody': ('\n', '\n'),`
			`'td': ('\|', ''),`
			`'th': ('\|', ''),`
			`'tr': ('', '\n')`
			`}`

			`BlOCK_ELEMENTS = {`
			`'h1': '<h1.?>(.?)</h1>',`
			`'h2': '<h2.?>(.?)</h2>',`
			`'h3': '<h3.?>(.?)</h3>',`
			`'h4': '<h4.?>(.?)</h4>',`
			`'h5': '<h5.?>(.?)</h5>',`
			`'h6': '<h6.?>(.?)</h6>',`
			`'hr': '<hr/>',`
			`'blockquote': '<blockquote.?>(.?)</blockquote>',`
			`'ul': '<ul.?>(.?)</ul>',`
			`'ol': '<ol.?>(.?)</ol>',`
			`'block_code': '<pre.?><code.?>(.*?)</code></pre>',`
			`'p': '<p\s.?>(.?)</p>',`
			`'p_with_out_class': '<p>(.*?)</p>',`
			`'thead': '<thead.?>(.?)</thead>',`
add 2024-08-28 07:23:08 +00:00			`'tr': '<tr>(.*?)</tr>',`
. 2019-11-08 03:27:23 +00:00			`}`

			`INLINE_ELEMENTS = {`
			`'td': '<td>(.*?)</td>',`
			`'tr': '<tr>(.*?)</tr>',`
			`'th': '<th>(.*?)</th>',`
			`'b': '<b>(.*?)</b>',`
			`'i': '<i>(.*?)</i>',`
			`'del': '<del>(.*?)</del>',`
			`'inline_p': '<p\s.?>(.?)</p>',`
			`'inline_p_with_out_class': '<p>(.*?)</p>',`
			`'code': '<code.?>(.?)</code>',`
			`'span': '<span.?>(.?)</span>',`
			`'ul': '<ul.?>(.?)</ul>',`
			`'ol': '<ol.?>(.?)</ol>',`
			`'li': '<li.?>(.?)</li>',`
			`'a': '<a.?href="(.?)".?>(.?)</a>',`
			`'em': '<em.?>(.?)</em>',`
empty 2024-07-03 09:41:44 +00:00			`'strong': '<strong.?>(.?)</strong>',`
add 2024-08-28 07:23:08 +00:00			`'math':'<span><span><span>(.*?)</span></span></span>',`
			`'img': '<img.?src="(.?)".*?/>',`
. 2019-11-08 03:27:23 +00:00			`}`

			`DELETE_ELEMENTS = ['<span.?>', '</span>', '<div.?>', '</div>']`


			`class Element:`
			`def __init__(self, start_pos, end_pos, content, tag, is_block=False):`
			`self.start_pos = start_pos`
			`self.end_pos = end_pos`
			`self.content = content`
			`self._elements = []`
			`self.is_block = is_block`
			`self.tag = tag`
			`self._result = None`

add 2024-08-28 07:23:08 +00:00			`print(str(content))`
. 2019-11-08 03:27:23 +00:00			`if self.is_block:`
			`self.parse_inline()`

			`def __str__(self):`
			`wrapper = MARKDOWN.get(self.tag)`
			`self._result = '{}{}{}'.format(wrapper[0], self.content, wrapper[1])`
			`return self._result`

			`def parse_inline(self):`
			`for tag, pattern in INLINE_ELEMENTS.items():`
			`if tag == 'a':`
			`self.content = re.sub(pattern, '[\g<2>](\g<1>)', self.content)`
			`elif tag == 'img':`
add 2024-08-28 07:23:08 +00:00			`self.content = re.sub(pattern, '![](\g<1>)', self.content)`
			`self.content = self.content`
. 2019-11-08 03:27:23 +00:00			`elif self.tag == 'ul' and tag == 'li':`
			`self.content = re.sub(pattern, '- \g<1>', self.content)`
			`elif self.tag == 'ol' and tag == 'li':`
			`self.content = re.sub(pattern, '1. \g<1>', self.content)`
			`elif self.tag == 'thead' and tag == 'tr':`
			`self.content = re.sub(pattern, '\g<1>\n', self.content.replace('\n', ''))`
			`elif self.tag == 'tr' and tag == 'th':`
			`self.content = re.sub(pattern, '\|\g<1>', self.content.replace('\n', ''))`
			`elif self.tag == 'tr' and tag == 'td':`
			`self.content = re.sub(pattern, '\|\g<1>', self.content.replace('\n', ''))`
empty 2024-07-03 09:41:44 +00:00			`elif self.tag == 'math':`
			`self.content = re.sub(pattern, '{}\g<1>{}'.format(wrapper[0], wrapper[1]), self.content)`
. 2019-11-08 03:27:23 +00:00			`else:`
测试 2024-07-10 00:11:19 +00:00			`wrapper = MARKDOWN.get(tag)`
			`if wrapper != None:`
			`self.content = re.sub(pattern, '{}\g<1>{}'.format(wrapper[0], wrapper[1]), self.content)`
. 2019-11-08 03:27:23 +00:00

			`class Tomd:`
			`def __init__(self, html='', options=None):`
			`self.html = html`
			`self.options = options`
			`self._markdown = ''`

			`def convert(self, html, options=None):`
			`elements = []`
			`for tag, pattern in BlOCK_ELEMENTS.items():`
			`for m in re.finditer(pattern, html, re.I \| re.S \| re.M):`
			`element = Element(start_pos=m.start(),`
			`end_pos=m.end(),`
			`content=''.join(m.groups()),`
			`tag=tag,`
			`is_block=True)`
			`can_append = True`
			`for e in elements:`
			`if e.start_pos < m.start() and e.end_pos > m.end():`
			`can_append = False`
			`elif e.start_pos > m.start() and e.end_pos < m.end():`
			`elements.remove(e)`
			`if can_append:`
			`elements.append(element)`

			`elements.sort(key=lambda element: element.start_pos)`
empty 2024-07-03 09:41:44 +00:00			`for e in elements:`
测试 2024-07-10 00:11:19 +00:00			`print(e.content)`
			`if(str(e).find('<img') != -1) :`
			`print(str(e))`
empty 2024-07-03 09:41:44 +00:00			`imgs = re.findall(r"<img[\s\S]*?>",str(e),re.S)`
			`if imgs:`
			`print("found 1",len(imgs))`
			`print(imgs)`
			`reg = """<img[^>]+src=["']([^'"<>]+)["'][^<>]+/?>"""`
			`imgs2 = re.findall(reg,str(e))`
			`print("found2",len(imgs))`
			`i = 0`
			`# https://www.testingcloud.club/sapi/api/download_pic`
			`for img in imgs2:`
			`print(img)`
			`resp = requests.post("https://www.testingcloud.club/sapi/api/download_pic",json.dumps({`
			`"url": (img),`
			`}))`
			`obj = json.loads(resp.text)`
			`# print("http://127.0.0.1:4596/api/image_download/" + obj['url'])`
			`# https://www.testingcloud.club/sapi/api/image_download/`

			`e.content = str(e).replace(imgs[i],"![](https://www.testingcloud.club/sapi/api/image_download/" + obj['url'] + ")")`
			`i = i + 1`

			`# e.content = str(e).replace(img,"\r\n https://www.testingcloud.club/sapi/api/image_download/" + obj['url'])`




. 2019-11-08 03:27:23 +00:00			`self._markdown = ''.join([str(e) for e in elements])`

			`for index, element in enumerate(DELETE_ELEMENTS):`
			`self._markdown = re.sub(element, '', self._markdown)`
			`return self._markdown`

			`@property`
			`def markdown(self):`
			`self.convert(self.html, self.options)`
add 2024-08-28 07:23:08 +00:00			`print(self._markdown)`
			`imgori = re.findall('(?:!\[.?\]\((.?)\))',self._markdown)`
			`i = 0`
			`for img in imgori:`
			`print(img)`
			`resp = requests.post("https://www.testingcloud.club/sapi/api/download_pic",json.dumps({`
			`"url": (img),`
			`}))`
			`obj = json.loads(resp.text)`
			`# print("http://127.0.0.1:4596/api/image_download/" + obj['url'])`
			`# https://www.testingcloud.club/sapi/api/image_download/`

			`self._markdown = self._markdown.replace(imgori[i],"https://www.testingcloud.club/sapi/api/image_download/" + obj['url'])`
			`i = i + 1`

. 2019-11-08 03:27:23 +00:00			`return self._markdown`


			`_inst = Tomd()`
			`convert = _inst.convert`