我有一个 pandas 数据框,并已将其写入文件。我还以 JSON 格式为该数据创建了一个模式:它以 Python 字典的形式存储,并同样写入了文件。
我尝试过使用 to_gbq
和命令行两种方式上传,在这两种情况下,我都遇到了一个关于重复(REPEATED)字段的错误。
以下是有关数据的信息:
代码
df.shape
len(clinvar_variant_schema)
schema_fields = [x['name'] for x in clinvar_variant_schema]
schema_fields.sort()
json.dumps(schema_fields)
colnames = df.columns.tolist()
colnames.sort()
json.dumps(colnames)
set(schema_fields).difference(set(colnames))
set(colnames).difference(set(schema_fields))
输出
(1000, 24)
24
'["AF_ESP", "AF_EXAC", "AF_TGP", "ALLELEID", "ALT", "CHROM", "CLNDISDB", "CLNDN", "CLNHGVS", "CLNREVSTAT", "CLNSIG", "CLNSIGCONF", "CLNVC", "CLNVCSO", "CLNVI", "FILTER", "GENEINFO", "ID", "MC", "ORIGIN", "POS", "QUAL", "REF", "RS"]'
'["AF_ESP", "AF_EXAC", "AF_TGP", "ALLELEID", "ALT", "CHROM", "CLNDISDB", "CLNDN", "CLNHGVS", "CLNREVSTAT", "CLNSIG", "CLNSIGCONF", "CLNVC", "CLNVCSO", "CLNVI", "FILTER", "GENEINFO", "ID", "MC", "ORIGIN", "POS", "QUAL", "REF", "RS"]'
set()
set()
Colnames Schema_Names
0 AF_ESP AF_ESP
1 AF_EXAC AF_EXAC
2 AF_TGP AF_TGP
3 ALLELEID ALLELEID
4 ALT ALT
5 CHROM CHROM
6 CLNDISDB CLNDISDB
7 CLNDN CLNDN
8 CLNHGVS CLNHGVS
9 CLNREVSTAT CLNREVSTAT
10 CLNSIG CLNSIG
11 CLNSIGCONF CLNSIGCONF
12 CLNVC CLNVC
13 CLNVCSO CLNVCSO
14 CLNVI CLNVI
15 FILTER FILTER
16 GENEINFO GENEINFO
17 ID ID
18 MC MC
19 ORIGIN ORIGIN
20 POS POS
21 QUAL QUAL
22 REF REF
23 RS RS
project_id = "my_project_id"
table_id = "my_dataset.clinvar_vcf"
df.to_gbq(
destination_table = table_id,
project_id = project_id,
if_exists = "replace",
table_schema = clinvar_variant_schema
)
IPython will make a temporary file named: /tmp/ipython_edit_pw1v55fy/ipython_edit_k3v1q7m5.py
1it [00:00, 2.93it/s]
---------------------------------------------------------------------------
InvalidResponse Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/google/cloud/bigquery/client.py in load_table_from_file(self, file_obj, destination, rewind, size, num_retries, job_id, job_id_prefix, location, project, job_config)
1829 response = self._do_resumable_upload(
-> 1830 file_obj, job_resource, num_retries
1831 )
/opt/conda/lib/python3.7/site-packages/google/cloud/bigquery/client.py in _do_resumable_upload(self, stream, metadata, num_retries)
2153 upload, transport = self._initiate_resumable_upload(
-> 2154 stream, metadata, num_retries
2155 )
/opt/conda/lib/python3.7/site-packages/google/cloud/bigquery/client.py in _initiate_resumable_upload(self, stream, metadata, num_retries)
2195 upload.initiate(
-> 2196 transport, stream, metadata, _GENERIC_CONTENT_TYPE, stream_final=False
2197 )
/opt/conda/lib/python3.7/site-packages/google/resumable_media/requests/upload.py in initiate(self, transport, stream, metadata, content_type, total_bytes, stream_final, timeout)
411 )
--> 412 self._process_initiate_response(response)
413 return response
/opt/conda/lib/python3.7/site-packages/google/resumable_media/_upload.py in _process_initiate_response(self, response)
505 self._get_status_code,
--> 506 callback=self._make_invalid,
507 )
/opt/conda/lib/python3.7/site-packages/google/resumable_media/_helpers.py in require_status_code(response, status_codes, get_status_code, callback)
110 u"Expected one of",
--> 111 *status_codes
112 )
InvalidResponse: ('Request failed with status code', 400, 'Expected one of', <HTTPStatus.OK: 200>, <HTTPStatus.CREATED: 201>)
During handling of the above exception, another exception occurred:
BadRequest Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/pandas_gbq/gbq.py in load_data(self, dataframe, dataset_id, table_id, chunksize, schema, progress_bar)
628 chunks = tqdm.tqdm(chunks)
--> 629 for remaining_rows in chunks:
630 logger.info(
/opt/conda/lib/python3.7/site-packages/tqdm/std.py in __iter__(self)
1173 try:
-> 1174 for obj in iterable:
1175 yield obj
/opt/conda/lib/python3.7/site-packages/pandas_gbq/load.py in load_chunks(client, dataframe, dataset_id, table_id, chunksize, schema, location)
81 job_config=job_config,
---> 82 location=location,
83 ).result()
/opt/conda/lib/python3.7/site-packages/google/cloud/bigquery/client.py in load_table_from_file(self, file_obj, destination, rewind, size, num_retries, job_id, job_id_prefix, location, project, job_config)
1836 except resumable_media.InvalidResponse as exc:
-> 1837 raise exceptions.from_http_response(exc.response)
1838
BadRequest: 400 POST https://bigquery.googleapis.com/upload/bigquery/v2/projects/my_project_id/jobs?uploadType=resumable: Cannot load CSV data with a repeated field. Field: FILTER
During handling of the above exception, another exception occurred:
GenericGBQException Traceback (most recent call last)
<ipython-input-55-19cb6dc0a4ee> in <module>
6 project_id = project_id,
7 if_exists = "replace",
----> 8 table_schema = clinvar_variant_schema
9 )
/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py in to_gbq(self, destination_table, project_id, chunksize, reauth, if_exists, auth_local_webserver, table_schema, location, progress_bar, credentials)
1655 location=location,
1656 progress_bar=progress_bar,
-> 1657 credentials=credentials,
1658 )
1659
/opt/conda/lib/python3.7/site-packages/pandas/io/gbq.py in to_gbq(dataframe, destination_table, project_id, chunksize, reauth, if_exists, auth_local_webserver, table_schema, location, progress_bar, credentials, verbose, private_key)
226 credentials=credentials,
227 verbose=verbose,
--> 228 private_key=private_key,
229 )
/opt/conda/lib/python3.7/site-packages/pandas_gbq/gbq.py in to_gbq(dataframe, destination_table, project_id, chunksize, reauth, if_exists, auth_local_webserver, table_schema, location, progress_bar, credentials, verbose, private_key)
1206 chunksize=chunksize,
1207 schema=table_schema,
-> 1208 progress_bar=progress_bar,
1209 )
1210
/opt/conda/lib/python3.7/site-packages/pandas_gbq/gbq.py in load_data(self, dataframe, dataset_id, table_id, chunksize, schema, progress_bar)
634 )
635 except self.http_error as ex:
--> 636 self.process_http_error(ex)
637
638 def schema(self, dataset_id, table_id):
/opt/conda/lib/python3.7/site-packages/pandas_gbq/gbq.py in process_http_error(ex)
433 # <https://cloud.google.com/bigquery/troubleshooting-errors>`__
434
--> 435 raise GenericGBQException("Reason: {0}".format(ex))
436
437 def run_query(
GenericGBQException: Reason: 400 POST https://bigquery.googleapis.com/upload/bigquery/v2/projects/my_project_id/jobs?uploadType=resumable: Cannot load CSV data with a repeated field. Field: FILTER
bq
bq load --source_format=CSV --field_delimiter=tab --replace --schema clinvar_variant_schema.json my_project_id:my_dataset.clinvar_vcf clinvar_expanded_vcf
BigQuery error in load operation: Cannot load CSV data with a repeated field.
Field: FILTER
---------------------------------------------------------------------------
CalledProcessError Traceback (most recent call last)
<ipython-input-45-be7b46a6694c> in <module>
----> 1 get_ipython().run_cell_magic('bash', '', 'bq load --source_format=CSV --field_delimiter=tab --replace --schema clinvar_variant_schema.json my_project_id:my_dataset.clinvar_vcf clinvar_expanded_vcf\n')
/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
2379 with self.builtin_trap:
2380 args = (magic_arg_s, cell)
-> 2381 result = fn(*args, **kwargs)
2382 return result
2383
/opt/conda/lib/python3.7/site-packages/IPython/core/magics/script.py in named_script_magic(line, cell)
140 else:
141 line = script
--> 142 return self.shebang(line, cell)
143
144 # write a basic docstring:
<decorator-gen-103> in shebang(self, line, cell)
/opt/conda/lib/python3.7/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
185 # but it's overkill for just that one bit of state.
186 def magic_deco(arg):
--> 187 call = lambda f, *a, **k: f(*a, **k)
188
189 if callable(arg):
/opt/conda/lib/python3.7/site-packages/IPython/core/magics/script.py in shebang(self, line, cell)
243 sys.stderr.flush()
244 if args.raise_error and p.returncode!=0:
--> 245 raise CalledProcessError(p.returncode, cell, output=out, stderr=err)
246
247 def _run_script(self, p, cell, to_close):
CalledProcessError: Command 'b'bq load --source_format=CSV --field_delimiter=tab --replace --schema clinvar_variant_schema.json my_project_id:my_dataset.clinvar_vcf clinvar_expanded_vcf\n'' returned non-zero exit status 1.
看起来CSV不支持嵌套或重复数据
https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv#limitations
我相信 to_gbq 默认会先把数据框转换为 CSV 再加载,因此您可能希望改用 CSV 以外的格式。例如,可以先将数据写成换行分隔的 JSON(NDJSON),再用 bq load --source_format=NEWLINE_DELIMITED_JSON 加载——该格式支持 REPEATED 字段。
如果想继续使用 CSV,请确保您的 clinvar_variant_schema 中没有 'mode': 'REPEATED'。另外还要检查表 clinvar_vcf 是否已经存在于 BigQuery 中,以及其现有列的模式是否与您提供的模式一致。
相关问题 更多 >
编程相关推荐