用另一个数据帧替换（覆盖）整个数据帧（Python 3.4，pandas）

# Resolve repeated student IDs according to the choice stored in `dec`:
#   '1' -> export the duplicates to CSV and stop the program
#   '2' -> drop every row that belongs to a duplicated group and continue
# `dec` is compared as a string (presumably it comes from input(), which
# returns str in Python 3 -- confirm against the code that sets it).
duplicate_count = len(SframeDup.index)

if duplicate_count > 0 and dec == '1':
    SframeDup.to_csv('NWEA CSVs/Students/StudentDuplicates.csv', sep=',')
    print("%d instances of repeated student IDs detected." % duplicate_count)
    print("See StudentDuplicates.csv for duplicates.")
    print("\nThis program will now stop.")
    # quit() and exit() work too, but only in the editor; doing this in an
    # IPython Notebook will restart the kernel and require re-running the
    # preceding code.
    raise SystemExit
elif duplicate_count > 0 and dec == '2':
    print("%d instances of repeated student IDs detected." % duplicate_count)
    print("See StudentDuplicates.csv for duplicates.")
    # keep=False marks every member of a duplicated group (first, last and
    # anything in between), so no copy of a repeated student survives.
    # `subset`/`keep` replace the `cols`/`take_last` keywords that were
    # removed from pandas.
    is_repeated = Sframe.duplicated(
        subset=['TermName', 'SchoolName', 'StudentID'], keep=False)
    Sframe = Sframe[~is_repeated]
else:
    # NOTE(review): this branch is also reached when duplicates exist but
    # `dec` is neither '1' nor '2' -- confirm that is intended.
    print("No duplicates found. Oh yeah!")

import csv
import glob
import os
import sys

import numpy as np
import pandas as pd

# Load every raw student CSV export and stack them into a single frame.
path = r'NWEA CSVs/Students/Raw'
allFiles = glob.glob(path + "/*.csv")
frames = [pd.read_csv(csv_path, index_col=None, header=0)
          for csv_path in allFiles]
# pd.concat raises on an empty list, so keep the empty-frame default when
# no CSV file matched the pattern.
Sframe = pd.concat(frames, ignore_index=False) if frames else pd.DataFrame()
Sframe.to_csv('NWEA CSVs/Students/OutStudents.csv', sep=',')

# Build a composite key (term + school + student ID) used to spot repeats.
Sframe["TermSchoolStudent"] = (
    Sframe["TermName"] + Sframe["SchoolName"] + Sframe["StudentID"].map(str)
)
# duplicated() with its default keep='first' flags the second and later
# occurrences of each key; those rows are what gets reported.
SframeDup = Sframe[Sframe.duplicated(subset="TermSchoolStudent")]

if len(SframeDup.index) > 0:
    SframeDup.to_csv('NWEA CSVs/Students/StudentDuplicates.csv', sep=',')
    print("%d instances of repeated student IDs detected." % len(SframeDup.index))
    print("See StudentDuplicates.csv for duplicates.")
    # keep=False marks every member of a duplicated group, so all copies of
    # a repeated student are removed. `subset`/`keep` replace the
    # `cols`/`take_last` keywords that were removed from pandas.
    is_repeated = Sframe.duplicated(
        subset=['TermName', 'SchoolName', 'StudentID'], keep=False)
    Sframe = Sframe[~is_repeated]

print(len(Sframe))

# Ask the user how to proceed once duplicates have been detected.
# input() returns a str in Python 3, so the choice must be compared against
# '1' / '2'; comparing against the ints 1 / 2 never matches.
dec = input("-->")
if dec == '1':
    print("This program will now stop.")
    print("this_file.csv to resolve a problem.")
    raise SystemExit
elif dec == '2':
    # Add a "Repeated" field to the duplicates table, filled with the
    # sentinel string "NaN" (a str, not a float NaN).
    SframeDup["Repeated"] = "NaN"
    # Left-join Sframe with the duplicates table (SframeDup) to identify all
    # rows of duplicates, including the first occurrences that had repeats.
    # NOTE(review): assumes both frames have an 'identifier' column and that
    # the merge yields a 'Repeated_y' column (i.e. Sframe already carries a
    # 'Repeated' column) -- confirm against the data.
    SframeWDup = pd.merge(Sframe, SframeDup, on='identifier', how='left')
    # Rows that matched the duplicates table carry the sentinel; keep only
    # the rows that do not.
    SframeWODup = SframeWDup[SframeWDup.Repeated_y != "NaN"]
    # Rebind Sframe so the rest of the script works with the frame that has
    # the detected duplicates removed.
    Sframe = SframeWODup

2条回答

网友

1楼 · 编辑于 2024-10-02 12:32:27

我做了以下两处修改后，问题解决了：1. 将 if-if-elif 结构替换为 if-elif-else（见下文）。2. 将 dec 作为字符串进行比较（即 dec == '1' 而不是 dec == 1）。

# Resolve repeated student IDs according to the choice stored in `dec`:
#   '1' -> export the duplicates to CSV and stop the program
#   '2' -> drop every row that belongs to a duplicated group and continue
# `dec` is compared as a string (presumably it comes from input(), which
# returns str in Python 3 -- confirm against the code that sets it).
duplicate_count = len(SframeDup.index)

if duplicate_count > 0 and dec == '1':
    SframeDup.to_csv('NWEA CSVs/Students/StudentDuplicates.csv', sep=',')
    print("%d instances of repeated student IDs detected." % duplicate_count)
    print("See StudentDuplicates.csv for duplicates.")
    print("\nThis program will now stop.")
    # quit() and exit() work too, but only in the editor; doing this in an
    # IPython Notebook will restart the kernel and require re-running the
    # preceding code.
    raise SystemExit
elif duplicate_count > 0 and dec == '2':
    print("%d instances of repeated student IDs detected." % duplicate_count)
    print("See StudentDuplicates.csv for duplicates.")
    # keep=False marks every member of a duplicated group (first, last and
    # anything in between), so no copy of a repeated student survives.
    # `subset`/`keep` replace the `cols`/`take_last` keywords that were
    # removed from pandas.
    is_repeated = Sframe.duplicated(
        subset=['TermName', 'SchoolName', 'StudentID'], keep=False)
    Sframe = Sframe[~is_repeated]
else:
    # NOTE(review): this branch is also reached when duplicates exist but
    # `dec` is neither '1' nor '2' -- confirm that is intended.
    print("No duplicates found. Oh yeah!")

网友

2楼 · 编辑于 2024-10-02 12:32:27

可以尝试 Sframe = SframeWODup.copy()。更新：下面这段代码能否达到你想要的结果？

# Made-up data: one unique student and one student that appears three times.
Sframe = pd.DataFrame({
    'TermName': ['Fall', 'Fall', 'Fall', 'Fall'],
    'DistrictName': ['Downtown', 'Downtown', 'Downtown', 'Downtown'],
    'SchoolName': ['Seattle Central', 'Ballard', 'Ballard', 'Ballard'],
    'StudentLastName': ['Doe', 'Doe', 'Doe', 'Doe'],
    'StudentFirstName': ['John', 'Jane', 'Jane', 'Jane'],
    'StudentMI': ['X', 'X', 'X', 'X'],
    'StudentID': ['1234', '9876', '9876', '9876'],
    'StudentDateOfBirth': ['2000-01-01', '2001-01-01', '2001-01-01', '2001-01-01'],
    'StudentEthnicGroup': ['Asian American', 'White', 'White', 'White'],
    'StudentGender': ['M', 'F', 'F', 'F'],
    'Grade': ['10th', '9th', '9th', '9th'],
    'TermSchoolStudent': ['Z', 'Z', 'Z', 'Z'],
})

# UPDATE: duplicates should be removed from the data frame completely, so
# drop_duplicates() (which keeps one copy of each group) is not used here.
# keep=False marks every member of a duplicated group; negating the mask
# keeps only rows whose (TermName, SchoolName, StudentID) is unique.
# `subset`/`keep` replace the `cols`/`take_last` keywords that were removed
# from pandas.
is_repeated = Sframe.duplicated(
    subset=['TermName', 'SchoolName', 'StudentID'], keep=False)
Sframe = Sframe[~is_repeated]

相关问题更多 >

编程相关推荐

热门问题

热门文章