数据字典逻辑到物理缩写名称转换

# Question setup: a business-term -> physical-abbreviation dictionary plus
# sample inputs with the output each one is expected to produce.

# Collects dictionary words that had no abbreviation, keyed by the word and
# mapped to the input string it came from.
noMatchFound = {}

# Optional loader: read the dictionary from a comma-delimited file instead of
# using the sample entries below.
# refDict = {}
# with open('dictionary.csv') as inputDict:
#     for line in inputDict:
#         busTerm, busAbbr = line.split(',')
#         refDict[busTerm] = busAbbr.replace("\n", "")

# Sample data dictionary entries.
refDict = {
    'user': 'USR',
    'call': 'CALL',
    'detail': 'DTL',
    'record': 'REC',
    'call detail record': 'CDR',
    'count': 'CNT',  # was written as 'count', 'CNT' (a syntax error)
}

input_string1 = "user call detail record"
# output should be "USR_CDR"
# noMatchFound - will be empty - since all are matched and replaced

input_string2 = "user test call detail record"
# output should be "USR_TEST_CDR"
# noMatchFound - should have an entry "TEST" with a reference to
# "user test call detail record"

input_string3 = "user call count detail record"
# output should be "USR_CALL_CNT_DTL_REC"
# noMatchFound - will be empty - since all are matched and replaced

input_string4 = "user call detail record count"
# output should be "USR_CDR_CNT"
# noMatchFound - will be empty - since all are matched and replaced

import re


def getLongestSequenceSize(inputStr, inDict):
    """Find the dictionary term with the longest match inside ``inputStr``.

    Both the input and every dictionary term are stripped and upper-cased
    before matching, so the comparison is case-insensitive. A term only
    matches on word boundaries; consecutive repetitions of the same term
    separated by at most one whitespace character count as a single match.

    Args:
        inputStr: The text to search.
        inDict: Mapping of term -> abbreviation (both strings).

    Returns:
        A two-element list ``[matched_text, abbreviation]`` where
        ``matched_text`` is the upper-cased, stripped text of the longest
        match and ``abbreviation`` is the stripped value of the term that
        produced it. Both are ``""`` when no term matches. On a tie the
        term that comes first in ``inDict`` iteration order wins.
    """
    haystack = inputStr.strip().upper()
    best_match = ""
    best_abbr = ""
    for term, abbr in inDict.items():
        needle = term.strip().upper()
        if not needle:
            # A blank term can never be a meaningful match.
            continue
        # Escape the term so characters such as '.' or '+' are matched
        # literally rather than being interpreted as regex syntax.
        pattern = r'(?:\b%s\b\s?)+' % re.escape(needle)
        matches = re.findall(pattern, haystack)
        if not matches:
            continue
        # Pick the longest match (not the lexicographically greatest one) and
        # compare on stripped length so an optional trailing whitespace
        # character cannot make a shorter term tie with a longer one.
        candidate = max(matches, key=len).strip()
        if len(candidate) > len(best_match):
            best_match = candidate
            best_abbr = abbr
    return [best_match, best_abbr.strip()]

1条回答

网友

1楼 · 发布于 2024-10-03 21:32:31

思路是：先把字典的键按长度从长到短排序，然后依次用 replace() 对输入字符串尝试每一个可能的替换。这样最长的短语会最先被替换，不会被其中较短的单词抢先匹配。

这正是你所期望的：

# Business term -> physical abbreviation. Multi-word phrases are allowed.
refDict = {
         'user': 'USR',
         'call': 'CALL',
         'detail': 'DTL',
         'record': 'REC',
         'call detail record': 'CDR',
         'count': 'CNT'}

# (term, abbreviation) pairs ordered from the longest term to the shortest so
# that a phrase is always tried before the individual words it contains.
sorted_ref = sorted( refDict.items(), key=lambda x:len(x[0]), reverse = True )

def do_work(input_string):
    """Convert a logical name into its abbreviated physical name.

    The input is split on whitespace into words. Each entry of ``sorted_ref``
    is applied in order (longest term first): every run of not-yet-replaced
    words that equals the term's words is replaced by the abbreviation.
    Matching is done on whole words, so a term never matches inside a longer
    word (``username`` is not treated as ``user`` + ``name``).

    Words left without an abbreviation are upper-cased in the result and
    recorded in the returned dictionary.

    Args:
        input_string: Whitespace-separated logical name.

    Returns:
        A tuple ``(name, noMatchFound)``: ``name`` is the abbreviations and
        upper-cased leftovers joined with ``_``; ``noMatchFound`` maps each
        word that had no abbreviation (as written in the input) to
        ``input_string``.
    """
    # Each slot is (text, resolved); resolved slots hold an abbreviation and
    # are never matched again by a later, shorter term.
    slots = [(word, False) for word in input_string.split()]
    for key, value in sorted_ref:
        phrase = key.split()
        if not phrase:
            continue
        width = len(phrase)
        merged = []
        pos = 0
        while pos < len(slots):
            window = slots[pos:pos + width]
            is_hit = len(window) == width and all(
                not resolved and text == wanted
                for (text, resolved), wanted in zip(window, phrase))
            if is_hit:
                merged.append((value, True))
                pos += width
            else:
                merged.append(slots[pos])
                pos += 1
        slots = merged

    noMatchFound = {}
    parts = []
    for text, resolved in slots:
        if resolved:
            parts.append(text)
        else:
            noMatchFound[text] = input_string
            parts.append(text.upper())
    return '_'.join(parts), noMatchFound

# Demo driver: run every sample input through do_work and show the result.
inputs = ["user call detail record", "user test call detail record",
          "user call count detail record","user call detail record count"]

for inp in inputs:
    # Single-argument print(...) calls produce the same text on Python 2 and
    # Python 3; the bare print statement is a syntax error on Python 3.
    print(inp)
    output, noMatchFound = do_work(inp)
    print(output)
    print(noMatchFound)
    print(' -')

输出：

user call detail record
USR_CDR
{}
 -
user test call detail record
USR_TEST_CDR
{'test': 'user test call detail record'}
 -
user call count detail record
USR_CALL_CNT_DTL_REC
{}
 -
user call detail record count
USR_CDR_CNT
{}
 -

输出：

相关问题更多 >

编程相关推荐

热门问题

热门文章