<p>由于合并的条件似乎很复杂,因此将数据加载到数据库并使用SQL可能是值得的。在内存中使用SQLite可以这样做(假设数据是逗号分隔的)</p>
<pre><code>import csv
import sqlite3
def createTable(cursor, rows, tablename):
tableCreated = False
for row in rows:
if not tableCreated:
sql = "CREATE TABLE %s(ROW INTEGER PRIMARY KEY, " + ", ".join(["c%d" % (i+1) for i in range(len(row))]) + ")"
cur.execute(sql % tablename)
tableCreated = True
sql = "INSERT INTO %s VALUES(NULL, " + ", ".join(["'" + c + "'" for c in row]) + ")"
cur.execute(sql % tablename)
conn.commit()
conn = sqlite3.connect(":memory:")
cur = conn.cursor()
for filename, tablename in [(path_to_csv1, "CSV1"), (path_to_csv2, "CSV2")]:
with open(filename, "r") as f:
reader = csv.reader(f, delimiter=',')
rows = [row for row in reader]
createTable(cur, rows, tablename)
</code></pre>
<p>然后可以在SQL中构造连接逻辑。您可以运行以下查询:</p>
<pre><code>for row in cur.execute(your_sql_statement):
print row
</code></pre>
<p>以下查询提供所需的输出:</p>
<pre><code>WITH
MATCHES AS( -- get all matches
SELECT CSV2.*
, CSV1.ROW as ROW_1
, CSV1.C4 as C4_1
, CSV1.C5 as C5_1
FROM CSV2
LEFT JOIN CSV1
ON CSV1.C4 LIKE '%' || CSV2.C2 || '%'
),
EXACT AS( -- matches where CSV1.C4 = CSV1.C5
SELECT *
FROM MATCHES
WHERE C4_1 = C5_1
),
MIN_ROW AS( -- CSV1.ROW of first occurence for each CSV2.C1
SELECT C1
, min(ROW_1) as ROW_1
FROM MATCHES
WHERE C1 NOT IN (SELECT C1 FROM EXACT)
GROUP BY C1, C2, C3, C4, C5
)
-- use C4=C5 first
SELECT *
FROM EXACT
UNION
-- if match not in exact, use first occurence
SELECT MATCHES.*
FROM MIN_ROW
INNER JOIN MATCHES
ON MIN_ROW.C1 = MATCHES.C1
AND (MIN_ROW.ROW_1 = MATCHES.ROW_1 OR MIN_ROW.ROW_1 IS NULL)
ORDER BY C1
</code></pre>