import copy
import uuid

from bs4 import BeautifulSoup
from bs4.element import NavigableString
# HTML tag names that _element_to_block counts as inline-style ancestors of a
# text node (their style ranges grow with the text found below them).
# NOTE(review): 'u' is listed here, but _element_to_block has no branch that
# descends into <u> elements, so it never actually appears as an ancestor.
INLINE_TAGNAMES = ['b', 'strong', 'i', 'em', 'u']
# HTML tag names that _element_to_block counts as inline-entity ancestors of a
# text node (their entity ranges grow with the text found below them).
INLINE_ENTITIES = ['a']
def add_html_to_body(html, body=None):
    """Convert ``html`` into blocks and return them appended to a copy of ``body``.

    Expects a body object (which is probably stored and used as JSON), i.e. a
    dict with a ``'blocks'`` list and an ``'entityMap'`` dict.

    Args:
        html: HTML markup to convert; parsed with BeautifulSoup's ``lxml``
            parser.
        body: Optional existing body.  It is never modified: the conversion
            works on a deep copy.  A falsy value (``None`` or ``{}``) starts
            from an empty body.

    Returns:
        A new body dict containing the original content followed by the
        blocks and entities produced from ``html``.
    """
    if body:
        # A shallow copy would still share the nested 'blocks' list and
        # 'entityMap' dict with the caller, and both are mutated below.
        body = copy.deepcopy(body)
    else:
        body = {'entityMap': {}, 'blocks': []}
    # Tolerate partially populated bodies instead of raising KeyError.
    body.setdefault('entityMap', {})
    body.setdefault('blocks', [])

    soup = BeautifulSoup(html, 'lxml')
    if soup.body is None:
        # Empty or whitespace-only markup produces no <body>; nothing to add.
        return body

    _element_to_block([soup.body], body['blocks'], body['entityMap'])
    return body
def _element_to_block(parent_els, blocks, entity_map):
    """Convert the children of ``parent_els[-1]`` into Draft.js-style raw blocks.

    Text nodes are appended to the current block's ``text``.  ``<b>``/
    ``<strong>`` and ``<i>``/``<em>`` open ``BOLD``/``ITALIC`` inline style
    ranges, ``<a>`` opens a ``LINK`` entity range and registers the entity in
    ``entity_map``, and ``<div>``/``<p>`` start a new block.  Any other
    element is skipped together with its contents.

    Args:
        parent_els: Ancestor chain (outermost first) ending with the element
            whose children are converted.  Inline ancestors in the chain are
            matched, innermost first, with the trailing ranges of the last
            block so that text found below them keeps extending those ranges.
        blocks: List of block dicts, mutated in place.  When empty, an empty
            ``unstyled`` block is created first.
        entity_map: Dict mapping entity key to entity, mutated in place.

    Returns:
        None.  Results are written into ``blocks`` and ``entity_map``.
    """
    starts_fresh = not blocks
    if starts_fresh:
        blocks.append(_create_block('', 'unstyled', [], []))
    last_block = blocks[-1]

    # Recover the ranges that the inline ancestors already opened on the last
    # block: the innermost ancestor owns the most recently appended range.
    open_ranges = []
    style_depth = 0
    entity_depth = 0
    for ancestor in reversed(parent_els):
        if ancestor.name in INLINE_TAGNAMES:
            style_depth += 1
            ranges, depth = last_block['inlineStyleRanges'], style_depth
        elif ancestor.name in INLINE_ENTITIES:
            entity_depth += 1
            ranges, depth = last_block['entityRanges'], entity_depth
        else:
            continue
        if depth <= len(ranges):
            open_ranges.append(ranges[-depth])

    _convert_children(parent_els[-1], blocks, entity_map, open_ranges,
                      starts_fresh)


def _convert_children(parent_el, blocks, entity_map, open_ranges,
                      block_is_fresh):
    """Write ``parent_el``'s children into ``blocks``, starting in ``blocks[-1]``.

    ``open_ranges`` holds the style/entity range dicts opened by inline
    ancestors; every text node extends all of them.  ``block_is_fresh`` is
    true when ``blocks[-1]`` was created for ``parent_el`` itself (or as the
    initial placeholder), so a leading ``<div>``/``<p>`` may reuse it while it
    is still empty instead of leaving an empty block behind.
    """
    current = blocks[-1]
    for el in parent_el.contents:
        if isinstance(el, NavigableString):
            # Exact-type check on purpose: comments, CDATA, doctypes, etc. are
            # NavigableString subclasses and must not leak into the text.
            if type(el) is NavigableString:
                text = el.string
                for open_range in open_ranges:
                    open_range['length'] += len(text)
                current['text'] += text
            continue

        if el.name in ('b', 'strong', 'i', 'em'):
            style_range = {
                'offset': len(current['text']),
                'length': 0,
                'style': 'BOLD' if el.name in ('b', 'strong') else 'ITALIC',
            }
            current['inlineStyleRanges'].append(style_range)
            _convert_children(el, blocks, entity_map,
                              open_ranges + [style_range], False)
        elif el.name == 'a':
            # Create entity here.  The range starts at length 0 and grows as
            # the link's text nodes are visited, like the style ranges do.
            entity_key = str(uuid.uuid4())[:8]
            entity_range = {
                'offset': len(current['text']),
                'length': 0,
                'key': entity_key,
            }
            current['entityRanges'].append(entity_range)
            entity_map[entity_key] = {
                'type': 'LINK',
                'mutability': 'IMMUTABLE',
                'data': {'url': el.get('href')},
            }
            _convert_children(el, blocks, entity_map,
                              open_ranges + [entity_range], False)
        elif el.name in ('div', 'p'):
            # Reuse the block only if it was made for this container and
            # nothing has been written to it yet; otherwise start a new one.
            # The block starts empty: its text is added by the recursion.
            if not (block_is_fresh and not current['text']):
                blocks.append(_create_block('', 'unstyled', [], []))
            block_is_fresh = False
            _convert_children(el, blocks, entity_map, [], True)
        else:
            # More branches based on the kind of elements you expect to see,
            # like u, ul, li, h1, h2, h3, table, etc. and 'atomic' entities.
            continue

        if blocks[-1] is not current:
            # A nested block element moved the write position.  Continue in
            # the newest block; the open ranges belong to the previous block
            # and must not grow past its text.
            current = blocks[-1]
            open_ranges = []
def _create_block(text, type, inline_style_ranges, entity_ranges, data={}):
return {
'text': text,
'type': type,
'inlineStyleRanges': inline_style_ranges,
'entityRanges': entity_ranges,
'depth': 0,
'data': data,
'key': str(uuid.uuid4())[:8]
}
我也面临同样的情况,我的目标是使用BeautifulSoup递归解析HTML文档,并使用一些帮助函数将块构建为Python对象(您可以使用DraftJS的Python库来代替此步骤)。通过这种方式,您将100%控制转换,并且可以完全控制DraftJS文档,同时仔细考虑自定义块渲染和元素映射
只要肯挽起袖子动手去做,这其实并不难。过去在没有现成库可用时,我会为这类问题发愁;但现在我完全不介意借此机会,避免为这样的简单任务引入额外的依赖项
编辑:
进一步思考后,你应该考虑在 NodeJS 中使用现有的 HTML -> DraftJS 转换库来完成这个过程
或
我没有时间用它创建一个库,但是这里有一个使用
BeautifulSoup
实现这一点的代码的初始版本,应该可以让您开始:相关问题 更多 >
编程相关推荐