Strange repeated-field error when uploading to BigQuery with pandas or the command line. All fields are unique

Posted 2024-09-26 22:46:48


I have a pandas DataFrame that I have also written out to a file. I also created a schema for the data in JSON format; I keep it as a Python object and write it to a file as well.
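The schema file itself is not shown here; as an assumption, a BigQuery JSON schema kept as a Python object is a list of dicts along these lines (the entries below are hypothetical, though the error output further down suggests FILTER is declared REPEATED):

clinvar_variant_schema = [
    {"name": "CHROM", "type": "STRING", "mode": "NULLABLE"},
    {"name": "POS", "type": "INTEGER", "mode": "NULLABLE"},
    {"name": "FILTER", "type": "STRING", "mode": "REPEATED"},  # REPEATED here is an assumption
    # ... one entry per column
]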

I have tried uploading both with to_gbq and with the command line, and in both cases I get an error about a repeated field.

Here is some information about the data:

Code

import json

# Shape of the DataFrame and number of fields in the schema
df.shape
len(clinvar_variant_schema)

# Field names from the schema, sorted
schema_fields = [x['name'] for x in clinvar_variant_schema]
schema_fields.sort()
json.dumps(schema_fields)

# Column names from the DataFrame, sorted
colnames = df.columns.tolist()
colnames.sort()
json.dumps(colnames)

# Both differences are empty, so the names match exactly
set(schema_fields).difference(set(colnames))
set(colnames).difference(set(schema_fields))

Output

(1000, 24)

24

'["AF_ESP", "AF_EXAC", "AF_TGP", "ALLELEID", "ALT", "CHROM", "CLNDISDB", "CLNDN", "CLNHGVS", "CLNREVSTAT", "CLNSIG", "CLNSIGCONF", "CLNVC", "CLNVCSO", "CLNVI", "FILTER", "GENEINFO", "ID", "MC", "ORIGIN", "POS", "QUAL", "REF", "RS"]'

'["AF_ESP", "AF_EXAC", "AF_TGP", "ALLELEID", "ALT", "CHROM", "CLNDISDB", "CLNDN", "CLNHGVS", "CLNREVSTAT", "CLNSIG", "CLNSIGCONF", "CLNVC", "CLNVCSO", "CLNVI", "FILTER", "GENEINFO", "ID", "MC", "ORIGIN", "POS", "QUAL", "REF", "RS"]'

set()

set()

      Colnames Schema_Names
0       AF_ESP       AF_ESP
1      AF_EXAC      AF_EXAC
2       AF_TGP       AF_TGP
3     ALLELEID     ALLELEID
4          ALT          ALT
5        CHROM        CHROM
6     CLNDISDB     CLNDISDB
7        CLNDN        CLNDN
8      CLNHGVS      CLNHGVS
9   CLNREVSTAT   CLNREVSTAT
10      CLNSIG       CLNSIG
11  CLNSIGCONF   CLNSIGCONF
12       CLNVC        CLNVC
13     CLNVCSO      CLNVCSO
14       CLNVI        CLNVI
15      FILTER       FILTER
16    GENEINFO     GENEINFO
17          ID           ID
18          MC           MC
19      ORIGIN       ORIGIN
20         POS          POS
21        QUAL         QUAL
22         REF          REF
23          RS           RS

Using pandas

project_id = "my_project_id"
table_id = "my_dataset.clinvar_vcf"
df.to_gbq(
    destination_table = table_id,
    project_id = project_id,
    if_exists = "replace",
    table_schema = clinvar_variant_schema
)
IPython will make a temporary file named: /tmp/ipython_edit_pw1v55fy/ipython_edit_k3v1q7m5.py

1it [00:00,  2.93it/s]

---------------------------------------------------------------------------
InvalidResponse                           Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/google/cloud/bigquery/client.py in load_table_from_file(self, file_obj, destination, rewind, size, num_retries, job_id, job_id_prefix, location, project, job_config)
   1829                 response = self._do_resumable_upload(
-> 1830                     file_obj, job_resource, num_retries
   1831                 )

/opt/conda/lib/python3.7/site-packages/google/cloud/bigquery/client.py in _do_resumable_upload(self, stream, metadata, num_retries)
   2153         upload, transport = self._initiate_resumable_upload(
-> 2154             stream, metadata, num_retries
   2155         )

/opt/conda/lib/python3.7/site-packages/google/cloud/bigquery/client.py in _initiate_resumable_upload(self, stream, metadata, num_retries)
   2195         upload.initiate(
-> 2196             transport, stream, metadata, _GENERIC_CONTENT_TYPE, stream_final=False
   2197         )

/opt/conda/lib/python3.7/site-packages/google/resumable_media/requests/upload.py in initiate(self, transport, stream, metadata, content_type, total_bytes, stream_final, timeout)
    411         )
--> 412         self._process_initiate_response(response)
    413         return response

/opt/conda/lib/python3.7/site-packages/google/resumable_media/_upload.py in _process_initiate_response(self, response)
    505             self._get_status_code,
--> 506             callback=self._make_invalid,
    507         )

/opt/conda/lib/python3.7/site-packages/google/resumable_media/_helpers.py in require_status_code(response, status_codes, get_status_code, callback)
    110             u"Expected one of",
--> 111             *status_codes
    112         )

InvalidResponse: ('Request failed with status code', 400, 'Expected one of', <HTTPStatus.OK: 200>, <HTTPStatus.CREATED: 201>)

During handling of the above exception, another exception occurred:

BadRequest                                Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/pandas_gbq/gbq.py in load_data(self, dataframe, dataset_id, table_id, chunksize, schema, progress_bar)
    628                 chunks = tqdm.tqdm(chunks)
--> 629             for remaining_rows in chunks:
    630                 logger.info(

/opt/conda/lib/python3.7/site-packages/tqdm/std.py in __iter__(self)
   1173         try:
-> 1174             for obj in iterable:
   1175                 yield obj

/opt/conda/lib/python3.7/site-packages/pandas_gbq/load.py in load_chunks(client, dataframe, dataset_id, table_id, chunksize, schema, location)
     81                 job_config=job_config,
---> 82                 location=location,
     83             ).result()

/opt/conda/lib/python3.7/site-packages/google/cloud/bigquery/client.py in load_table_from_file(self, file_obj, destination, rewind, size, num_retries, job_id, job_id_prefix, location, project, job_config)
   1836         except resumable_media.InvalidResponse as exc:
-> 1837             raise exceptions.from_http_response(exc.response)
   1838 

BadRequest: 400 POST https://bigquery.googleapis.com/upload/bigquery/v2/projects/eacri-genomics/jobs?uploadType=resumable: Cannot load CSV data with a repeated field. Field: FILTER

During handling of the above exception, another exception occurred:

GenericGBQException                       Traceback (most recent call last)
<ipython-input-55-19cb6dc0a4ee> in <module>
      6     project_id = project_id,
      7     if_exists = "replace",
----> 8     table_schema = clinvar_variant_schema
      9 )

/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py in to_gbq(self, destination_table, project_id, chunksize, reauth, if_exists, auth_local_webserver, table_schema, location, progress_bar, credentials)
   1655             location=location,
   1656             progress_bar=progress_bar,
-> 1657             credentials=credentials,
   1658         )
   1659 

/opt/conda/lib/python3.7/site-packages/pandas/io/gbq.py in to_gbq(dataframe, destination_table, project_id, chunksize, reauth, if_exists, auth_local_webserver, table_schema, location, progress_bar, credentials, verbose, private_key)
    226         credentials=credentials,
    227         verbose=verbose,
--> 228         private_key=private_key,
    229     )

/opt/conda/lib/python3.7/site-packages/pandas_gbq/gbq.py in to_gbq(dataframe, destination_table, project_id, chunksize, reauth, if_exists, auth_local_webserver, table_schema, location, progress_bar, credentials, verbose, private_key)
   1206         chunksize=chunksize,
   1207         schema=table_schema,
-> 1208         progress_bar=progress_bar,
   1209     )
   1210 

/opt/conda/lib/python3.7/site-packages/pandas_gbq/gbq.py in load_data(self, dataframe, dataset_id, table_id, chunksize, schema, progress_bar)
    634                 )
    635         except self.http_error as ex:
--> 636             self.process_http_error(ex)
    637 
    638     def schema(self, dataset_id, table_id):

/opt/conda/lib/python3.7/site-packages/pandas_gbq/gbq.py in process_http_error(ex)
    433         # <https://cloud.google.com/bigquery/troubleshooting-errors>`__
    434 
--> 435         raise GenericGBQException("Reason: {0}".format(ex))
    436 
    437     def run_query(

GenericGBQException: Reason: 400 POST https://bigquery.googleapis.com/upload/bigquery/v2/projects/eacri-genomics/jobs?uploadType=resumable: Cannot load CSV data with a repeated field. Field: FILTER

Using the bq command line

bq load --source_format=CSV --field_delimiter=tab --replace --schema clinvar_variant_schema.json  my_project_id:my_dataset.clinvar_vcf clinvar_expanded_vcf
BigQuery error in load operation: Cannot load CSV data with a repeated field.
Field: FILTER

---------------------------------------------------------------------------
CalledProcessError                        Traceback (most recent call last)
<ipython-input-45-be7b46a6694c> in <module>
----> 1 get_ipython().run_cell_magic('bash', '', 'bq load --source_format=CSV --field_delimiter=tab --replace --schema clinvar_variant_schema.json  eacri-genomics:clinvar_9302020.clinvar_vcf clinvar_expanded_vcf\n')

/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
   2379             with self.builtin_trap:
   2380                 args = (magic_arg_s, cell)
-> 2381                 result = fn(*args, **kwargs)
   2382             return result
   2383 

/opt/conda/lib/python3.7/site-packages/IPython/core/magics/script.py in named_script_magic(line, cell)
    140             else:
    141                 line = script
--> 142             return self.shebang(line, cell)
    143 
    144         # write a basic docstring:

<decorator-gen-103> in shebang(self, line, cell)

/opt/conda/lib/python3.7/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
    185     # but it's overkill for just that one bit of state.
    186     def magic_deco(arg):
--> 187         call = lambda f, *a, **k: f(*a, **k)
    188 
    189         if callable(arg):

/opt/conda/lib/python3.7/site-packages/IPython/core/magics/script.py in shebang(self, line, cell)
    243             sys.stderr.flush()
    244         if args.raise_error and p.returncode!=0:
--> 245             raise CalledProcessError(p.returncode, cell, output=out, stderr=err)
    246 
    247     def _run_script(self, p, cell, to_close):

CalledProcessError: Command 'b'bq load --source_format=CSV --field_delimiter=tab --replace --schema clinvar_variant_schema.json  eacri-genomics:clinvar_9302020.clinvar_vcf clinvar_expanded_vcf\n'' returned non-zero exit status 1.

2 Answers

It looks like CSV loads do not support nested or repeated data:

https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv#limitations

I believe to_gbq converts the DataFrame to CSV by default and then loads it, so you may want to use a format other than CSV, as sketched below.
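A minimal sketch of one non-CSV route, using the google-cloud-bigquery client directly instead of to_gbq (this is not from the original post; it assumes the same df, project_id, table_id and clinvar_variant_schema objects as the question). load_table_from_dataframe serializes the DataFrame via Parquet, which does allow repeated (ARRAY) fields:

from google.cloud import bigquery  # DataFrame loads also require pyarrow

client = bigquery.Client(project=project_id)

# Build SchemaField objects from the JSON-style schema dicts
job_config = bigquery.LoadJobConfig(
    schema=[bigquery.SchemaField.from_api_repr(f) for f in clinvar_variant_schema],
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,  # like if_exists="replace"
)

# Serializes to Parquet under the hood, so repeated fields are accepted
job = client.load_table_from_dataframe(df, table_id, job_config=job_config)
job.result()  # wait for the load job to finish

On the command line, the analogous change would be to export the data as newline-delimited JSON and pass --source_format=NEWLINE_DELIMITED_JSON to bq load instead of CSV.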

Make sure your clinvar_variant_schema does not contain 'mode': 'REPEATED'. Also check whether the table clinvar_vcf already exists in BigQuery, and what schema its columns have.
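A quick way to check both points (hypothetical helper code, not part of the original answer; it assumes the same clinvar_variant_schema, project_id and table_id objects as the question):

from google.api_core.exceptions import NotFound
from google.cloud import bigquery

# Any field declared with mode REPEATED will break a CSV load
repeated = [f["name"] for f in clinvar_variant_schema if f.get("mode") == "REPEATED"]
print("REPEATED fields in local schema:", repeated)

# Inspect the schema of the existing table, if it is already there
client = bigquery.Client(project=project_id)
try:
    table = client.get_table(table_id)  # e.g. "my_dataset.clinvar_vcf"
    print([(f.name, f.field_type, f.mode) for f in table.schema])
except NotFound:
    print("Table does not exist yet")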
