<p>以下是我最终使用的完整代码：</p>
<pre><code>#read in df containing actions in chunks:
# Load the action log in 1000-row chunks and stitch the chunks into one frame.
tp = read_csv('/data/logactions.csv',
              quoting=csv.QUOTE_NONNUMERIC,
              iterator=True, chunksize=1000,
              encoding='utf-8', skipinitialspace=True,
              # Skip malformed rows instead of aborting the whole load.
              # NOTE: pandas >= 1.3 spells this on_bad_lines='skip'.
              error_bad_lines=False)
df = concat(tp, ignore_index=True)

# Every action starts unclassified. Rows without a URL can never be matched
# against the classification files, so drop them and renumber the index.
df["klass"] = NaN
df = df[notnull(df['url'])]
df = df.reset_index(drop=True)

# Walk the per-day classification files (start and end dates inclusive) and
# fill in klass for every action whose URL appears in that day's file.
startdate = date(2013, 1, 1)
enddate = date(2013, 1, 26)
d = startdate
while d <= enddate:
    dstring = d.isoformat()
    print(dstring)
    # Read this day's classifications in chunks, with the same parser options
    # as the action log.
    # NOTE(review): the file is named .tsv but is parsed with sep=',' -- confirm.
    tp = read_csv('/data/textContentClassified/content{dstring}classfied.tsv'.format(dstring=dstring),
                  sep=',', quoting=csv.QUOTE_NONNUMERIC,
                  iterator=True, chunksize=1000,
                  encoding='utf-8', skipinitialspace=True,
                  error_bad_lines=False)
    thisdatedf = concat(tp, ignore_index=True)
    # Keep one row per non-null URL so the URL -> klass lookup is unambiguous.
    thisdatedf = thisdatedf.drop_duplicates(['url'])
    thisdatedf = thisdatedf.reset_index(drop=True)
    thisdatedf = thisdatedf[notnull(thisdatedf['url'])]
    # map() looks each action URL up in today's URL -> klass table and yields
    # NaN for URLs that are absent, whereas label-indexing with df.url raises
    # KeyError on missing labels in current pandas. The result keeps df's
    # index, so combine_first only fills rows that are still unclassified.
    klass_by_url = thisdatedf.set_index('url').klass
    df["klass"] = df.klass.combine_first(df.url.map(klass_by_url))
    # Advance to the next day.
    d = d + timedelta(days=1)
</code></pre>