用占位符替换字符串中的短语，并在函数处理之后把它们还原回来。

import re
from copy import copy

# Sample phrases that should be swapped for placeholders, and the text that
# contains them.
phrases = ["'s morgen", "'s-Hertogenbosch", "depository financial institution"]
original_text = "Something, 's morgen, ik 's-Hertogenbosch im das depository financial institution gehen"

# Unanchored replacement: every occurrence of a phrase is swapped for a
# numbered placeholder, even inside a longer word ("org" is replaced inside
# "morgen").
phrases = ["org", "'s-Hertogenbosch", "depository financial institution"]
original_text = "Something, 's morgen, ik 's-Hertogenbosch im das depository financial institution gehen"

# Maps each placeholder to its phrase with spaces turned into underscores.
backplacement = {}
text = copy(original_text)
for i, phrase in enumerate(phrases):
    placeholder = "MWEPHRASE{}".format(i)
    backplacement[placeholder] = phrase.replace(' ', '_')
    # re.escape keeps regex metacharacters in a phrase from being
    # interpreted as pattern syntax.
    text = re.sub(re.escape(phrase), placeholder, text)
print(text)

# Whole-phrase replacement: a phrase is swapped for a numbered placeholder
# only when it is not glued to a neighbouring word character.
phrases = ["'s morgen", "'s-Hertogenbosch", "depository financial institution"]
original_text = "Something, 's morgen, ik 's-Hertogenbosch im das depository financial institution gehen"

# Maps each placeholder to its phrase with spaces turned into underscores.
backplacement = {}
text = copy(original_text)
for i, phrase in enumerate(phrases):
    placeholder = "MWEPHRASE{}".format(i)
    backplacement[placeholder] = phrase.replace(' ', '_')
    # "\b" only matches between a word and a non-word character, so it can
    # never match in front of a phrase that starts with an apostrophe when
    # that phrase follows a space.  Lookarounds express the actual intent:
    # "no word character directly before or after the phrase".
    pattern = r"(?<!\w){}(?!\w)".format(re.escape(phrase))
    text = re.sub(pattern, placeholder, text)
print(text)

3条回答

网友

1楼 · 编辑于 2024-09-27 09:33:00

与其使用 re.sub，不如把字符串按要替换的词拆分开来处理！

def do_something_with_str(string):
    """Return ``string`` prefixed with "@", or ``string`` unchanged if falsy.

    Stand-in for whatever processing should be applied to a text piece.
    """
    if not string:
        return string
    return f"@{string}"


def get_replaced_list(string, words):
    """Split ``string`` around every occurrence of each entry in ``words``.

    Returns a list of ``(piece, is_text)`` tuples in their original order.
    ``is_text`` is True for text between matches (possibly an empty string)
    and False for a matched word.  Pieces already marked as matched words
    are never split again by later words.
    """
    segments = [(string, True)]

    for word in words:
        marker = (word, False)
        rebuilt = []

        for segment in segments:
            text, is_text = segment

            # A previously matched word is carried over untouched.
            if not is_text:
                rebuilt.append(segment)
                continue

            # Put the marker back between consecutive pieces of the split.
            for index, piece in enumerate(text.split(word)):
                if index:
                    rebuilt.append(marker)
                rebuilt.append((piece, True))

        segments = rebuilt

    return segments


if __name__ == '__main__':
    # Demo: split the string around the words, process only the text pieces,
    # then glue everything back together.
    initial_string = 'acbbcbbcacbbcbbcacbbcbbca'
    words_to_replace = ('a', 'c')
    replaced_list = get_replaced_list(initial_string, words_to_replace)

    modified_list = []
    for entry in replaced_list:
        if entry[1]:
            # Text piece: run it through the processing function.
            modified_list.append((do_something_with_str(entry[0]), True))
        else:
            # Matched word: keep it exactly as it was.
            modified_list.append(entry)

    final_string = ''.join(piece for piece, _ in modified_list)

以下是上例的变量值：

^{pr2}$

如您所见，列表由元组组成。每个元组包含两个值：一个字符串和一个布尔值，布尔值表示该字符串是普通文本还是被替换的词（为 True 时表示普通文本）。得到替换列表之后，可以像示例那样修改它，并检查某一项是否为文本（if x[1] == True）。希望有帮助！

P.S. 形如 f"some string here {some_variable_here}" 的字符串格式化写法需要 Python 3.6 及以上版本。

网友

2楼 · 编辑于 2024-09-27 09:33:00

下面是一个你可以使用的策略：

# need this module for the reduce function
import functools as fn

phrases = ["'s morgen", "'s-Hertogenbosch", "depository financial institution"]
original_text = "Something, 's morgen, ik 's-Hertogenbosch im das depository financial institution gehen"

# Convert phrases into a dictionary of numbered placeholders (tokens):
# phrase -> "MWEPHRASE<n>"
tokens = {kw: "MWEPHRASE%s" % i for i, kw in enumerate(phrases)}

# Replace embedded phrases with their respective token
tokenized = fn.reduce(lambda s, kw: s.replace(kw, tokens[kw]), phrases, original_text)

# Apply text cleaning logic on the tokenized text.
# This assumes the placeholders are left untouched
# (although it's ok to move them around).
# cleanUpfunction is the caller's own cleaning routine; it is not defined here.
cleaned_text = cleanUpfunction(tokenized)

# Reverse the token dictionary: placeholder -> original phrase
unTokens = {v: k for k, v in tokens.items()}

# Rebuild phrases with the original text associated to each placeholder.
# The keys of unTokens are placeholders, so the placeholders (not the
# phrases) must be iterated here.  Longer placeholders are restored first so
# that "MWEPHRASE1" cannot eat the front of "MWEPHRASE10".
final_text = fn.reduce(
    lambda s, token: s.replace(token, unTokens[token]),
    sorted(unTokens, key=len, reverse=True),
    cleaned_text,
)

网友

3楼 · 编辑于 2024-09-27 09:33:00

我认为在这个任务中使用正则表达式有两个关键点：

使用自定义边界，捕捉它们，并将它们与短语一起替换回来。
使用函数在两个方向上处理替换匹配。

下面是一个使用这种方法的实现。我稍微修改了一下你的文本，让其中一个短语重复出现。

import re
from copy import copy 

original_text = "Something, 's morgen, ik 's-Hertogenbosch im das depository financial institution gehen 's morgen"
text = copy(original_text)

#
# The phrases of interest
#
phrases = ["'s morgen", "'s-Hertogenbosch", "depository financial institution"]

#
# Create the mapping dictionaries
#
phrase_to_mwe = {}
mwe_to_phrase = {}

#
# Build the mappings:
#
#    phrase_to_mwe: phrase      -> numbered placeholder
#    mwe_to_phrase: placeholder -> phrase with spaces replaced by underscores
#
for i, phrase in enumerate(phrases):

    mwephrase = "MWEPHRASE{}".format(i)
    mwe_to_phrase[mwephrase] = phrase.replace(' ', '_')
    phrase_to_mwe[phrase] = mwephrase

#
# Regex match handlers
#
def handle_forward(match):
    """Return the captured front boundary followed by the phrase's placeholder."""

    b1 = match.group(1)
    phrase = match.group(2)

    # The back boundary is only looked at, never consumed, so it must not be
    # written back here.
    return b1 + phrase_to_mwe[phrase]


def handle_backward(match):
    """Return the underscore-joined phrase for the matched placeholder.

    Raises KeyError if the placeholder is not in mwe_to_phrase.
    """

    return mwe_to_phrase[match.group(1)]

#
# The forward regex will look like:
#
#    (^|[ ])(depository\ financial\ institution|'s\-Hertogenbosch|'s\ morgen)(?=[, ]|$)
#
# which has three components:
#
#    (1) Front boundary (captured, written back by handle_forward)
#    (2) Phrase         (captured, replaced by its placeholder)
#    (3) Back boundary  (lookahead only, not consumed)
#
# The back boundary is a lookahead so that the space after one phrase is
# still available as the front boundary of a directly following phrase.
#
# Phrases are escaped so regex metacharacters in them match literally, and
# sorted longest first so a phrase that is a prefix of a longer phrase cannot
# win the alternation.
#
# Anchors allow matching at the beginning and end of the text. Additional boundary characters can be
# added as necessary, e.g. to allow semicolons after a phrase, we could update the back boundary to:
#
#    (?=[,; ]|$)
#
alternatives = '|'.join(re.escape(p) for p in sorted(phrases, key=len, reverse=True))
regex_forward  = re.compile(r'(^|[ ])(' + alternatives + r')(?=[, ]|$)')
regex_backward = re.compile(r'(MWEPHRASE\d+)')

#
# Pretend we cleaned the text in the middle
#
cleaned = 'MWEPHRASE0 ik MWEPHRASE1 MWEPHRASE2 MWEPHRASE0'

#
# Do the translations
#
text1 = regex_forward.sub(handle_forward, text)
text2 = regex_backward.sub(handle_backward, cleaned)

print('original: {}'.format(original_text))
print('text1   : {}'.format(text1))
print('text2   : {}'.format(text2))

运行此操作将生成：

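original: Something, 's morgen, ik 's-Hertogenbosch im das depository financial institution gehen 's morgen
text1   : Something, MWEPHRASE0, ik MWEPHRASE1 im das MWEPHRASE2 gehen MWEPHRASE0
text2   : 's_morgen ik 's-Hertogenbosch depository_financial_institution 's_morgen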

相关问题更多 >

编程相关推荐

热门问题

热门文章