从Excelfile匹配Python中的正则表达式

import os import re import pymysql pymysql.install_as_MySQLdb() import pandas as pd import sqlalchemy def insert_or_update(engine, pd_table, table_name): inserts = 0 updates = 0 for i in range(len(pd_table)): vals_with_quotes = ["'" + str(x) + "'" for x in pd_table.loc[i, :].values] # print(vals_with_quotes) update_pairs = [str(c) + " = '" + str(v) + "'" for c, v in zip(pd_table.columns, pd_table.loc[i, :])] query = f"INSERT INTO {table_name} ({', '.join(list(pd_table.columns.values))}) " \ f"VALUES ({', '.join(vals_with_quotes)}) " \ f"ON DUPLICATE KEY UPDATE {', '.join(update_pairs)}" print(query) result = engine.execute(query) if result.lastrowid == 0: updates += 1 else: inserts += 1 print(f"Inserted {inserts} rows and updated {updates} rows.") schema = '---' alchemy_connect = "---" engine = sqlalchemy.create_engine(alchemy_connect) # connect to server engine.execute(f"USE {schema}") # select new db # engine.execute("SET NAMES UTF8MB4;") query = "SELECT * FROM .... where ...=..." pm = pd.read_sql(query, engine) rootpath = "path/" for root, dirs, files in os.walk(rootpath): for file in files: print(root, dirs, files, file) d = pd.read_excel(root + file, header=None) d.drop(columns=[0], inplace=True) d.rename(columns={1: "content"}, inplace=True) participants = [] for ix, row in d.iterrows(): c = row["content"] match = re.search(r"^(.+:)(.+)|(r(\w+)?\s*\[(.*)\]\s*(\w+))", c) prefix = match.group(1) only_content = match.group(2) try: timestamp = re.search(r"\[(\d{1,2}:\d{1,2}:\d{1,2})\]", prefix).group(1) except: timestamp = "-99" # print(timestamp) if re.search(r"\s(Versuchsleiter|ersuchsleiter|Versuchsleit|Versuch):", prefix): id_code = "Versuchsleiter" else: starting_digits = re.search(r"^(\d+)", prefix) id_code = re.search(r"(\d{2,4}.{1,3}):", prefix).group(1) if hasattr(starting_digits, 'group'): id_code = starting_digits.group(1) + id_code # # get pid participant = pm.loc[pm["id_code"] == id_code, "pid"] try: pid = participant.values[0] except: pid = "Versuchsleiter" # print(ix, pid, id_code, only_content, timestamp) if pid and pid not in participants and pid != "Versuchsleiter": participants.append(pid) d.loc[ix, "pid"] = pid d.loc[ix, "timestamp"] = timestamp d.loc[ix, "content"] = only_content.strip() d.loc[ix, "is_participant"] = 0 if pid == "Versuchsleiter" else 1 d = d[["pid", "is_participant", "content", "timestamp"]] d.loc[(d['pid'] == "Versuchsleiter"), "pid"] = participants[0] d.loc[(d['pid'] == None), "pid"] = participants[0] insert_or_update(engine, d, "table of sql")``` I need "Versuchsleiter" since some of the ID's are "Versuchsleiter" Thank you!

2条回答

网友

1楼 · 编辑于 2024-10-02 18:28:28

谢谢你的帮助，但这给了我以下的错误

Traceback (most recent call last):
  File "C:/Users/.../PycharmProjects/.../.../....py", line 80, in <module>
    insert_or_update(engine, d, "sql table")

TypeError: not enough arguments for format string

网友

2楼 · 编辑于 2024-10-02 18:28:28

您应该利用使用capturing groups的优势

所有初始正则表达式匹配（在c = row["content"]之后和# get pid之前）都可以用

match = re.search(r"^\[(\d{1,2}:\d{1,2}:\d{1,2})]\s+(\w+):\s*(.*)", c)
if match:
  timestamp = match.group(1)
  id_code = match.group(2)
  only_content = match.group(3)
  if re.search(r"(?:Versuch(?:sleit(?:er)?)?|ersuchsleiter)", id_code):
    id_code = "Versuchsleiter"

您的时间戳将是0:00:33，只有内容将包含(content)，id代码将包含30091aA

见regex demo

相关问题更多 >

编程相关推荐

热门问题

热门文章