"""Convert a small subset of HTML to Markdown with regular expressions.

Block-level tags are located in the input with the patterns in
``BlOCK_ELEMENTS``; the captured content of each block is then rewritten
with the patterns in ``INLINE_ELEMENTS`` and wrapped with the prefix/suffix
pair from ``MARKDOWN``.
"""
import re

__all__ = ['Tomd', 'convert']

# tag name -> (text emitted before the content, text emitted after it)
MARKDOWN = {
    'h1': ('\n# ', '\n'),
    'h2': ('\n## ', '\n'),
    'h3': ('\n### ', '\n'),
    'h4': ('\n#### ', '\n'),
    'h5': ('\n##### ', '\n'),
    'h6': ('\n###### ', '\n'),
    'code': ('`', '`'),
    'ul': ('', ''),
    'ol': ('', ''),
    'li': ('- ', ''),
    'blockquote': ('\n> ', '\n'),
    'em': ('**', '**'),
    'strong': ('**', '**'),
    'block_code': ('\n```\n', '\n```\n'),
    'span': ('', ''),
    'p': ('\n', '\n'),
    'p_with_out_class': ('\n', '\n'),
    'inline_p': ('', ''),
    'inline_p_with_out_class': ('', ''),
    'b': ('**', '**'),
    'i': ('*', '*'),
    'del': ('~~', '~~'),
    'hr': ('\n---', '\n\n'),
    'thead': ('\n', '|------\n'),
    'tbody': ('\n', '\n'),
    'td': ('|', ''),
    'th': ('|', ''),
    'tr': ('', '\n'),
}

# NOTE(review): the tag markup inside the pattern literals below was missing
# from the reviewed copy of this file (only the ``(.*?)`` groups survived).
# The patterns are rebuilt from the dictionary keys and from the group
# numbers the substitution templates rely on -- confirm against the
# project's canonical source.
#
# The name keeps its historical spelling ("BlOCK") so existing imports
# continue to work.
BlOCK_ELEMENTS = {
    'h1': r'<h1.*?>(.*?)</h1>',
    'h2': r'<h2.*?>(.*?)</h2>',
    'h3': r'<h3.*?>(.*?)</h3>',
    'h4': r'<h4.*?>(.*?)</h4>',
    'h5': r'<h5.*?>(.*?)</h5>',
    'h6': r'<h6.*?>(.*?)</h6>',
    'hr': r'<hr/>',
    'blockquote': r'<blockquote.*?>(.*?)</blockquote>',
    'ul': r'<ul.*?>(.*?)</ul>',
    'ol': r'<ol.*?>(.*?)</ol>',
    'block_code': r'<pre.*?><code.*?>(.*?)</code></pre>',
    # ``\s`` keeps this from matching other tags that start with "<p".
    'p': r'<p\s.*?>(.*?)</p>',
    'p_with_out_class': r'<p>(.*?)</p>',
    'thead': r'<thead.*?>(.*?)</thead>',
    'tr': r'<tr>(.*?)</tr>',
}

# Short tags (td, tr, th, b, i, del) are matched without attributes so that
# e.g. the ``b`` pattern cannot swallow ``<blockquote>``.
INLINE_ELEMENTS = {
    'td': r'<td>(.*?)</td>',
    'tr': r'<tr>(.*?)</tr>',
    'th': r'<th>(.*?)</th>',
    'b': r'<b>(.*?)</b>',
    'i': r'<i>(.*?)</i>',
    'del': r'<del>(.*?)</del>',
    'inline_p': r'<p\s.*?>(.*?)</p>',
    'inline_p_with_out_class': r'<p>(.*?)</p>',
    'code': r'<code.*?>(.*?)</code>',
    'span': r'<span.*?>(.*?)</span>',
    'ul': r'<ul.*?>(.*?)</ul>',
    'ol': r'<ol.*?>(.*?)</ol>',
    'li': r'<li.*?>(.*?)</li>',
    # Group 1 is the URL and group 2 the enclosed text; the link/image
    # templates in Element.parse_inline reference both groups.
    'img': r'<img.*?src="(.*?)".*?>(.*?)</img>',
    'a': r'<a.*?href="(.*?)".*?>(.*?)</a>',
    'em': r'<em.*?>(.*?)</em>',
    'strong': r'<strong.*?>(.*?)</strong>',
}

# Leftover wrapper tags stripped from the final Markdown text.
DELETE_ELEMENTS = [r'<span.*?>', r'</span>', r'<div.*?>', r'</div>']


class Element:
    """One matched HTML fragment together with its Markdown rendering.

    Args:
        start_pos: Offset of the match start in the source HTML.
        end_pos: Offset of the match end in the source HTML.
        content: Text captured inside the tag.
        tag: Key into ``MARKDOWN`` selecting the prefix/suffix pair.
        is_block: When true, inline tags inside ``content`` are converted
            immediately on construction.
    """

    def __init__(self, start_pos, end_pos, content, tag, is_block=False):
        self.start_pos = start_pos
        self.end_pos = end_pos
        self.content = content
        self._elements = []
        self.is_block = is_block
        self.tag = tag
        self._result = None

        if self.is_block:
            self.parse_inline()

    def __str__(self):
        """Return the content wrapped in the Markdown prefix/suffix of the tag."""
        prefix, suffix = MARKDOWN.get(self.tag)
        self._result = '{}{}{}'.format(prefix, self.content, suffix)
        return self._result

    def parse_inline(self):
        """Rewrite inline HTML tags in ``self.content`` as Markdown, in place.

        Patterns are applied in the iteration order of ``INLINE_ELEMENTS``.
        A few (block tag, inline tag) combinations use a dedicated
        template; everything else uses the generic ``MARKDOWN`` wrapper.
        """
        for tag, pattern in INLINE_ELEMENTS.items():
            if tag == 'a':
                self.content = re.sub(pattern, r'[\g<2>](\g<1>)', self.content)
            elif tag == 'img':
                self.content = re.sub(pattern, r'![\g<2>](\g<1>)', self.content)
            elif self.tag == 'ul' and tag == 'li':
                self.content = re.sub(pattern, r'- \g<1>', self.content)
            elif self.tag == 'ol' and tag == 'li':
                self.content = re.sub(pattern, r'1. \g<1>', self.content)
            elif self.tag == 'thead' and tag == 'tr':
                # Newlines are dropped first so a header row ends up on one line.
                self.content = re.sub(
                    pattern, r'\g<1>\n', self.content.replace('\n', ''))
            elif self.tag == 'tr' and tag in ('th', 'td'):
                self.content = re.sub(
                    pattern, r'|\g<1>', self.content.replace('\n', ''))
            else:
                prefix, suffix = MARKDOWN.get(tag)
                self.content = re.sub(
                    pattern, prefix + r'\g<1>' + suffix, self.content)


class Tomd:
    """HTML-to-Markdown converter.

    Args:
        html: HTML text converted when the ``markdown`` property is read.
        options: Stored and passed to ``convert``; not otherwise used here.
    """

    def __init__(self, html='', options=None):
        self.html = html
        self.options = options
        self._markdown = ''

    def convert(self, html, options=None):
        """Convert ``html`` to Markdown, store it on the instance and return it.

        Only the outermost match is kept when block matches nest: a match
        lying strictly inside an already collected block is ignored, and
        previously collected blocks lying strictly inside a new match are
        discarded. Text outside any recognised block is not emitted.

        Args:
            html: HTML source text.
            options: Accepted for interface compatibility; unused.

        Returns:
            The generated Markdown string.
        """
        elements = []
        for tag, pattern in BlOCK_ELEMENTS.items():
            for match in re.finditer(pattern, html, re.I | re.S | re.M):
                start, end = match.start(), match.end()

                if any(e.start_pos < start and e.end_pos > end
                       for e in elements):
                    continue

                # Rebuild the list instead of calling remove() while
                # iterating over it, which skipped the element following
                # each removal and left nested blocks duplicated in the
                # output.
                elements = [
                    e for e in elements
                    if not (e.start_pos > start and e.end_pos < end)
                ]
                elements.append(Element(start_pos=start,
                                        end_pos=end,
                                        content=''.join(match.groups()),
                                        tag=tag,
                                        is_block=True))

        elements.sort(key=lambda element: element.start_pos)
        self._markdown = ''.join(str(e) for e in elements)

        for pattern in DELETE_ELEMENTS:
            self._markdown = re.sub(pattern, '', self._markdown)
        return self._markdown

    @property
    def markdown(self):
        """Markdown for ``self.html``; re-converted on every access."""
        self.convert(self.html, self.options)
        return self._markdown


# Module-level convenience: ``convert(html)`` uses a shared Tomd instance.
_inst = Tomd()
convert = _inst.convert