Python将列表中的部分字符串相互比较

2024-09-30 06:18:19 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在尝试编写一段代码,将列表中的各个字符串相互比较,然后根据它们的相似之处生成一个正则表达式

# Sample input from the question: dash-separated identifiers to be merged.
# NOTE(review): the name `list` shadows the built-in `list` type for any code
# that runs after this assignment in the same module.
list = ["LONDON-UK-L16-N1",
        "LONDON-UK-L17-N1",
        "LONDON-UK-L16-N2",
        "LONDON-UK-L17-N2",
        "PARIS-France-L16-N2"]

我试图得到如下输出

LONDON-UK-L(16|17)-N(1|2)

可能吗?谢谢

更新:为了说得更清楚,我想实现的是: 输入:列表或字符串 操作:将列表项相互比较,检查相似性(固定字符串中相同的开头部分),并对各项中其余不相同的部分使用正则表达式,这样就可以用一条(正则表达式形式的)输出代替多个条目 输出:能够匹配这些不相同部分的正则表达式

输入: tez15-3-s1-y2 tez15-3-s2-y2 bro40-55-s1-y2

输出: tez15-3-s(1|2)-y2 ,bro40-55-s1-y2


Tags: 字符串代码列表相似性londonukn2s1
3条回答

我已经实施了以下解决方案:

import re 

# Sample input: one dash-separated identifier per entry.
data = [
    "LONDON-UK-L16-N1",
    "LONDON-UK-L17-N1",
    "LONDON-UK-L16-N2",
    "LONDON-UK-L16-N2",
    "PARIS-France-L16-N2",
]

def deconstruct(data):
  """Build a nested prefix tree from dash-separated strings.

  Each string is split on '-'. For every token, the first run of non-digit
  characters becomes a key one level below the current node, and the first
  run of digits is added to the set stored under key ``0`` of that node.

  Args:
    data: iterable of strings such as ``'LONDON-UK-L16-N1'``.

  Returns:
    A nested dict of the form
    ``{text: {0: {digit strings}, child_text: {...}}}``.

  Raises:
    KeyError: if digits are found in a token before any token of the same
      string contained a non-digit run (the root node has no ``0`` entry).
  """
  tree = {}

  for item in data:
    node = tree

    for token in item.split('-'):
      # Raw strings: '\D' and '\d' inside a plain literal are invalid
      # escape sequences and trigger a warning on recent Python versions.
      text = re.search(r'\D+', token)
      if text:
        # Descend into (or create) the child node for this text part.
        node = node.setdefault(text.group(), {0: set()})

      digits = re.search(r'\d+', token)
      if digits:
        node[0].add(digits.group())

  return tree

def construct(data, level=0):
  """Render a prefix tree (as built by ``deconstruct``) into pattern strings.

  Every non-zero key of ``data`` is a text part whose child node holds a set
  of digit strings under key ``0``. One digit string is appended to the text
  as is; several are rendered as a ``(a|b|...)`` alternation ordered by
  numeric value.

  Args:
    data: tree node, a dict mapping text parts to child nodes; the entry
      under key ``0`` (the node's own digit set) is skipped here.
    level: recursion depth. At depth 0 the token lists are joined with '-'
      and returned as strings; deeper calls return lists of tokens.

  Returns:
    A list of pattern strings when ``level`` is 0, otherwise a list of
    token lists.
  """
  rows = []

  for key, child in data.items():
    if key == 0:
      continue

    numbers = child[0]
    if len(numbers) > 1:
      # Order numerically ('9' before '10'); the string itself breaks ties
      # such as '1' vs '01' so the output stays deterministic.
      ordered = sorted(numbers, key=lambda s: (int(s), s))
      label = key + '(' + '|'.join(ordered) + ')'
    elif numbers:
      label = key + next(iter(numbers))
    else:
      label = key

    tails = construct(child, level + 1)
    if tails:
      rows.extend([label] + tail for tail in tails)
    else:
      rows.append([label])

  return rows if level > 0 else ['-'.join(row) for row in rows]

# Build the prefix tree from the sample data, render it and print the result.
print(construct(deconstruct(data)))
# ['LONDON-UK-L(16|17)-N(1|2)', 'PARIS-France-L16-N2']

从你的描述中看不太出具体的问题是什么。不过,由于你作为示例提供的数据格式一致且有序,只需将列表中的各项拆分并分类,即可轻松解决此问题

loc_list = ["LONDON-UK-L16-N1", "LONDON-UK-L17-N1", "LONDON-UK-L16-N2",
            "LONDON-UK-L16-N2", "PARIS-France-L16-N2"]

# Split every entry on '-' into its four fields: place, country, L-tag, N-tag.
split_loc_list = [location.split("-") for location in loc_list]

# Maps "<place>-<country>" to {"L": {digit strings}, "N": {digit strings}}.
locs = {}

for loc in split_loc_list:
    tags = locs.setdefault("-".join(loc[0:2]), {})
    tags.setdefault("L", set()).add(loc[2].strip("L"))
    tags.setdefault("N", set()).add(loc[3].strip("N"))

for loc, vals in locs.items():
    # Sort by numeric value but keep the original digit strings, so that
    # leading zeros (e.g. "01") survive and the pattern still matches the input.
    L_vals_joined = "|".join(sorted(vals["L"], key=lambda v: (int(v), v)))
    N_vals_joined = "|".join(sorted(vals["N"], key=lambda v: (int(v), v)))

    print(f"{loc}-L({L_vals_joined})-N({N_vals_joined})")

将输出:

LONDON-UK-L(16|17)-N(1|2)
PARIS-France-L(16)-N(2)

因为这里只有两个标记(“L”和“N”),所以我直接把它们写进了代码里。如果可能出现很多种标记,则可以用下面的方法按任意字母拆分:

import re
# Break a token such as "L16" into its runs of digits / non-digits,
# e.g. ["L", "16"]. Raw string: "\d" and "\D" inside a plain literal are
# invalid escape sequences.
# NOTE(review): assumes the token is a tag followed by a number, so that the
# first two runs are (tag, value) -- confirm for other input shapes.
split = re.findall(r'\d+|\D+', loc[2])
key, val = split[0], split[1]
locs.setdefault("-".join(loc[0:2]), {}).setdefault(key, set()).add(val)

然后遍历所有标记,而不是在第二个循环中仅获取“L”和“N”

我就这个问题发布了这个新的(第二个)实现,我认为更准确,希望有帮助:

import re 

# Sample input for merge(): one dash-separated identifier per entry.
data = [
    "LONDON-UK-L16-N1",
    "LONDON-UK-L17-N1",
    "LONDON-UK-L16-N2",
    "LONDON-UK-L17-N2",
    "LONDON-UK-L18-N2",
    "PARIS-France-L16-N2",
]

def _split_token(token):
  """Return (first non-digit run, first digit run) of token; '' where absent."""
  text = re.search(r'\D+', token)
  number = re.search(r'\d+', token)
  return (text.group() if text else '', number.group() if number else '')


def _render_group(head, text, numbers, tail):
  """Rebuild one row, collapsing several numbers into a '(a|b)' alternation."""
  if len(numbers) > 1:
    # Numeric order ('9' before '10'); '' (token without digits) counts as 0
    # and the string itself breaks ties to keep the output deterministic.
    ordered = sorted(numbers, key=lambda s: (int(s or 0), s))
    token = text + '(' + '|'.join(ordered) + ')'
  else:
    token = text + numbers[0]
  return head + [token] + tail


def merge(data):
  """Merge dash-separated strings that differ only in one token's number.

  The strings are sorted and split on '-'. Columns are then processed from
  right to left: consecutive rows whose other tokens and whose text part of
  the current token are identical are collapsed into a single row, with the
  differing numbers rendered as ``text(a|b|...)``.

  Args:
    data: list of strings such as ``'LONDON-UK-L16-N1'``. The list is not
      modified. The number of columns is taken from the first string after
      sorting.

  Returns:
    A list of pattern strings; an empty list for empty input.

  Raises:
    IndexError: if a string has fewer '-'-separated tokens than the first
      string after sorting.
  """
  if not data:
    return []

  # sorted() instead of list.sort() so the caller's list is left untouched.
  rows = [item.split('-') for item in sorted(data)]

  for col in range(len(rows[0]) - 1, -1, -1):
    merged = []
    current = None
    numbers = []

    for row in rows:
      text, number = _split_token(row[col])
      # Compare the parts as a tuple: concatenating them into one string
      # would make e.g. ('A', 'BC') and ('AB', 'C') look identical.
      group = (row[:col], text, row[col + 1:])
      if group != current:
        if current is not None:
          merged.append(_render_group(current[0], current[1], numbers, current[2]))
        current = group
        numbers = []
      if number not in numbers:
        numbers.append(number)

    # Flush the last open group of this column.
    merged.append(_render_group(current[0], current[1], numbers, current[2]))
    rows = merged

  return ['-'.join(row) for row in rows]

# Merge the sample data and print the resulting patterns; the trailing
# comment shows the output reported for this input.
print(merge(data))  # ['LONDON-UK-L(16|17)-N(1|2)', 'LONDON-UK-L18-N2', 'PARIS-France-L16-N2']

相关问题 更多 >

    热门问题