From 218b8426bfbfe52a1c00b9d32e8bc4f7b23b31ac Mon Sep 17 00:00:00 2001 From: Takashi Nishibayashi Date: Fri, 15 Apr 2016 03:44:41 +0900 Subject: [PATCH 01/78] Add stream insert options --- bigquery/client.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/bigquery/client.py b/bigquery/client.py index 33e8275..8e00e8b 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -1137,7 +1137,8 @@ def wait_for_job(self, job, interval=5, timeout=60): return job_resource - def push_rows(self, dataset, table, rows, insert_id_key=None): + def push_rows(self, dataset, table, rows, insert_id_key=None, + skip_invalid_rows=None, ignore_unknown_values=None): """Upload rows to BigQuery table. Parameters @@ -1150,6 +1151,10 @@ def push_rows(self, dataset, table, rows, insert_id_key=None): A ``list`` of rows (``dict`` objects) to add to the table insert_id_key : str, optional Key for insertId in row + skip_invalid_rows : bool, optional + Insert all valid rows of a request, even if invalid rows exist. + ignore_unknown_values : bool, optional + Accept rows that contain values that do not match the schema. Returns ------- @@ -1173,6 +1178,12 @@ def push_rows(self, dataset, table, rows, insert_id_key=None): "rows": rows_data } + if skip_invalid_rows is not None: + data['skipInvalidRows'] = skip_invalid_rows + + if ignore_unknown_values is not None: + data['ignoreUnknownValues'] = ignore_unknown_values + try: response = table_data.insertAll( projectId=self.project_id, From bfbfde63702a9bf79579460b2e9fa0806bc0eaa0 Mon Sep 17 00:00:00 2001 From: Jason Bennett Date: Sun, 17 Apr 2016 14:32:52 -0700 Subject: [PATCH 02/78] Upgrade to latest OAuth library(2.0.2) and Google Python library (1.5.0) --- bigquery/client.py | 67 +++++++++++++++++++---------------- bigquery/tests/test_client.py | 64 ++++++++++++++++----------------- 2 files changed, 67 insertions(+), 64 deletions(-) diff --git a/bigquery/client.py b/bigquery/client.py index 33e8275..31f74d3 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -4,19 +4,19 @@ from collections import defaultdict from datetime import datetime, timedelta from hashlib import sha256 +from io import StringIO from time import sleep, time -import httplib2 import six -from apiclient.discovery import build, DISCOVERY_URI -from apiclient.errors import HttpError - from bigquery.errors import (BigQueryTimeoutException, JobExecutingException, JobInsertException, UnfinishedQueryException) from bigquery.schema_builder import schema_from_record +from googleapiclient.discovery import build, DISCOVERY_URI +from googleapiclient.errors import HttpError +from httplib2 import Http -BIGQUERY_SCOPE = 'https://www.googleapis.com/auth/bigquery' -BIGQUERY_SCOPE_READ_ONLY = 'https://www.googleapis.com/auth/bigquery.readonly' +BIGQUERY_SCOPE = ['https://www.googleapis.com/auth/bigquery'] +BIGQUERY_SCOPE_READ_ONLY = ['https://www.googleapis.com/auth/bigquery.readonly'] CACHE_TIMEOUT = timedelta(seconds=30) @@ -90,56 +90,63 @@ def get_client(project_id, credentials=None, """ if not credentials: - assert (service_account and (private_key or private_key_file)) or (json_key or json_key_file), \ + assert (service_account and (private_key or private_key_file)) or ( + json_key or json_key_file), \ 'Must provide AssertionCredentials or service account and P12 key or JSON key' if service_url is None: service_url = DISCOVERY_URI + scope = BIGQUERY_SCOPE_READ_ONLY if readonly else BIGQUERY_SCOPE + if private_key_file: - with open(private_key_file, 'rb') as key_file: - 
private_key = key_file.read() + credentials = _credentials().from_p12_keyfile(service_account, + private_key_file, + scopes=scope) + + if private_key: + try: + if isinstance(private_key, basestring): + private_key = private_key.decode('utf-8') + except NameError: + # python3 -- private_key is already unicode + pass + credentials = _credentials().from_p12_keyfile_buffer( + service_account, + StringIO(private_key), + scopes=scope) if json_key_file: - with open(json_key_file, 'r') as key_file: - json_key = json.load(key_file) + credentials = _credentials().from_json_keyfile_name(json_key_file, + scopes=scope) if json_key: - service_account = json_key['client_email'] - private_key = json_key['private_key'] + credentials = _credentials().from_json_keyfile_dict(json_key, + scopes=scope) bq_service = _get_bq_service(credentials=credentials, - service_url=service_url, - service_account=service_account, - private_key=private_key, - readonly=readonly) + service_url=service_url) return BigQueryClient(bq_service, project_id, swallow_results) -def _get_bq_service(credentials=None, service_url=None, service_account=None, private_key=None, - readonly=True): +def _get_bq_service(credentials=None, service_url=None): """Construct an authorized BigQuery service object.""" - assert credentials or (service_account and private_key), \ - 'Must provide AssertionCredentials or service account and key' - - if not credentials: - scope = BIGQUERY_SCOPE_READ_ONLY if readonly else BIGQUERY_SCOPE - credentials = _credentials()(service_account, private_key, scope=scope) + assert credentials, 'Must provide ServiceAccountCredentials' - http = httplib2.Http() - http = credentials.authorize(http) - service = build('bigquery', 'v2', http=http, discoveryServiceUrl=service_url) + http = credentials.authorize(Http()) + service = build('bigquery', 'v2', http=http, + discoveryServiceUrl=service_url) return service def _credentials(): """Import and return SignedJwtAssertionCredentials class""" - from oauth2client.client import SignedJwtAssertionCredentials + from oauth2client.service_account import ServiceAccountCredentials - return SignedJwtAssertionCredentials + return ServiceAccountCredentials class BigQueryClient(object): diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py index f7050c6..d86c4c8 100644 --- a/bigquery/tests/test_client.py +++ b/bigquery/tests/test_client.py @@ -2,18 +2,16 @@ import mock import six -from nose.tools import raises - -from apiclient.errors import HttpError from bigquery import client from bigquery.errors import ( JobInsertException, JobExecutingException, BigQueryTimeoutException ) +from googleapiclient.errors import HttpError +from nose.tools import raises class HttpResponse(object): - def __init__(self, status, reason='There was an error'): """ Args: @@ -24,7 +22,6 @@ def __init__(self, status, reason='There was an error'): class TestGetClient(unittest.TestCase): - def setUp(self): client._bq_client = None @@ -51,7 +48,7 @@ def test_initialize_readonly(self, mock_build, mock_return_cred): mock_cred = mock.Mock() mock_http = mock.Mock() mock_service_url = mock.Mock() - mock_cred.return_value.authorize.return_value = mock_http + mock_cred.from_p12_keyfile_buffer.return_value.authorize.return_value = mock_http mock_bq = mock.Mock() mock_build.return_value = mock_bq key = 'key' @@ -65,9 +62,11 @@ def test_initialize_readonly(self, mock_build, mock_return_cred): readonly=True) mock_return_cred.assert_called_once_with() - mock_cred.assert_called_once_with(service_account, key, - 
scope=BIGQUERY_SCOPE_READ_ONLY) - self.assertTrue(mock_cred.return_value.authorize.called) + mock_cred.from_p12_keyfile_buffer.assert_called_once_with( + service_account, mock.ANY, + scopes=BIGQUERY_SCOPE_READ_ONLY) + self.assertTrue( + mock_cred.from_p12_keyfile_buffer.return_value.authorize.called) mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, discoveryServiceUrl=mock_service_url) self.assertEquals(mock_bq, bq_client.bigquery) @@ -84,7 +83,7 @@ def test_initialize_read_write(self, mock_build, mock_return_cred): mock_cred = mock.Mock() mock_http = mock.Mock() mock_service_url = mock.Mock() - mock_cred.return_value.authorize.return_value = mock_http + mock_cred.from_p12_keyfile_buffer.return_value.authorize.return_value = mock_http mock_bq = mock.Mock() mock_build.return_value = mock_bq key = 'key' @@ -98,9 +97,10 @@ def test_initialize_read_write(self, mock_build, mock_return_cred): readonly=False) mock_return_cred.assert_called_once_with() - mock_cred.assert_called_once_with(service_account, key, - scope=BIGQUERY_SCOPE) - self.assertTrue(mock_cred.return_value.authorize.called) + mock_cred.from_p12_keyfile_buffer.assert_called_once_with( + service_account, mock.ANY, scopes=BIGQUERY_SCOPE) + self.assertTrue( + mock_cred.from_p12_keyfile_buffer.return_value.authorize.called) mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, discoveryServiceUrl=mock_service_url) self.assertEquals(mock_bq, bq_client.bigquery) @@ -108,9 +108,7 @@ def test_initialize_read_write(self, mock_build, mock_return_cred): @mock.patch('bigquery.client._credentials') @mock.patch('bigquery.client.build') - @mock.patch('__builtin__.open' if six.PY2 else 'builtins.open') - def test_initialize_key_file(self, mock_open, mock_build, - mock_return_cred): + def test_initialize_key_file(self, mock_build, mock_return_cred): """Ensure that a BigQueryClient is initialized and returned with read/write permissions using a private key file. 
""" @@ -119,12 +117,10 @@ def test_initialize_key_file(self, mock_open, mock_build, mock_cred = mock.Mock() mock_http = mock.Mock() mock_service_url = mock.Mock() - mock_cred.return_value.authorize.return_value = mock_http + mock_cred.from_p12_keyfile.return_value.authorize.return_value = mock_http mock_bq = mock.Mock() mock_build.return_value = mock_bq key_file = 'key.pem' - key = 'key' - mock_open.return_value.__enter__.return_value.read.return_value = key service_account = 'account' project_id = 'project' mock_return_cred.return_value = mock_cred @@ -134,11 +130,12 @@ def test_initialize_key_file(self, mock_open, mock_build, service_account=service_account, private_key_file=key_file, readonly=False) - mock_open.assert_called_once_with(key_file, 'rb') mock_return_cred.assert_called_once_with() - mock_cred.assert_called_once_with(service_account, key, - scope=BIGQUERY_SCOPE) - self.assertTrue(mock_cred.return_value.authorize.called) + mock_cred.from_p12_keyfile.assert_called_once_with(service_account, + key_file, + scopes=BIGQUERY_SCOPE) + self.assertTrue( + mock_cred.from_p12_keyfile.return_value.authorize.called) mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, discoveryServiceUrl=mock_service_url) self.assertEquals(mock_bq, bq_client.bigquery) @@ -146,34 +143,33 @@ def test_initialize_key_file(self, mock_open, mock_build, @mock.patch('bigquery.client._credentials') @mock.patch('bigquery.client.build') - @mock.patch('__builtin__.open' if six.PY2 else 'builtins.open') - def test_initialize_json_key_file(self, mock_open, mock_build, mock_return_cred): + def test_initialize_json_key_file(self, mock_build, mock_return_cred): """Ensure that a BigQueryClient is initialized and returned with read/write permissions using a JSON key file. 
""" from bigquery.client import BIGQUERY_SCOPE - import json mock_cred = mock.Mock() mock_http = mock.Mock() mock_service_url = mock.Mock() - mock_cred.return_value.authorize.return_value = mock_http + mock_cred.from_json_keyfile_name.return_value.authorize.return_value = mock_http mock_bq = mock.Mock() mock_build.return_value = mock_bq json_key_file = 'key.json' - json_key = {'client_email': 'mail', 'private_key': 'pkey'} - mock_open.return_value.__enter__.return_value.read.return_value = json.dumps(json_key) project_id = 'project' mock_return_cred.return_value = mock_cred bq_client = client.get_client( - project_id, service_url=mock_service_url, json_key_file=json_key_file, readonly=False) + project_id, service_url=mock_service_url, + json_key_file=json_key_file, readonly=False) - mock_open.assert_called_once_with(json_key_file, 'r') mock_return_cred.assert_called_once_with() - mock_cred.assert_called_once_with(json_key['client_email'], json_key['private_key'], scope=BIGQUERY_SCOPE) - self.assertTrue(mock_cred.return_value.authorize.called) - mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, discoveryServiceUrl=mock_service_url) + mock_cred.from_json_keyfile_name.assert_called_once_with(json_key_file, + scopes=BIGQUERY_SCOPE) + self.assertTrue( + mock_cred.from_json_keyfile_name.return_value.authorize.called) + mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, + discoveryServiceUrl=mock_service_url) self.assertEquals(mock_bq, bq_client.bigquery) self.assertEquals(project_id, bq_client.project_id) From 5a9290a3d43e904efb248eda82930de918356436 Mon Sep 17 00:00:00 2001 From: Takashi Nishibayashi Date: Mon, 18 Apr 2016 12:12:52 +0900 Subject: [PATCH 03/78] Add tests --- bigquery/tests/test_client.py | 41 +++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py index f7050c6..3cf84c8 100644 --- a/bigquery/tests/test_client.py +++ b/bigquery/tests/test_client.py @@ -2108,6 +2108,47 @@ def test_push_success(self): self.mock_table_data.insertAll.return_value.execute.assert_has_calls( execute_calls) + def test_request_data_with_options(self): + """Ensure that insertAll body has optional property only when + the optional parameter of push_rows passed. 
+ """ + expected_body = self.data.copy() + + self.client.push_rows( + self.dataset, self.table, self.rows, + insert_id_key='one') + self.mock_table_data.insertAll.assert_called_with( + projectId=self.project, + datasetId=self.dataset, + tableId=self.table, + body=expected_body) + + self.client.push_rows( + self.dataset, self.table, self.rows, + insert_id_key='one', + ignore_unknown_values=False, + skip_invalid_rows=False) + expected_body['ignoreUnknownValues'] = False + expected_body['skipInvalidRows'] = False + self.mock_table_data.insertAll.assert_called_with( + projectId=self.project, + datasetId=self.dataset, + tableId=self.table, + body=expected_body) + + self.client.push_rows( + self.dataset, self.table, self.rows, + insert_id_key='one', + ignore_unknown_values=True, + skip_invalid_rows=True) + expected_body['ignoreUnknownValues'] = True + expected_body['skipInvalidRows'] = True + self.mock_table_data.insertAll.assert_called_with( + projectId=self.project, + datasetId=self.dataset, + tableId=self.table, + body=expected_body) + class TestGetAllTables(unittest.TestCase): From 6436432cc490e835506a85e9fb7cab216a84deca Mon Sep 17 00:00:00 2001 From: Takashi Nishibayashi Date: Fri, 22 Apr 2016 15:41:13 +0900 Subject: [PATCH 04/78] Use package logger instead of root logger --- bigquery/client.py | 50 ++++++++++++++++++++------------------- bigquery/query_builder.py | 16 ++++++------- 2 files changed, 34 insertions(+), 32 deletions(-) diff --git a/bigquery/client.py b/bigquery/client.py index a3c64ec..68d3bb5 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -1,6 +1,6 @@ import calendar import json -import logging +from logging import getLogger from collections import defaultdict from datetime import datetime, timedelta from hashlib import sha256 @@ -42,6 +42,8 @@ JOB_FORMAT_NEWLINE_DELIMITED_JSON JOB_DESTINATION_FORMAT_CSV = JOB_FORMAT_CSV +logger = getLogger(__name__) + def get_client(project_id, credentials=None, service_url=None, service_account=None, @@ -186,7 +188,7 @@ def _submit_query_job(self, query_data): On timeout """ - logging.debug('Submitting query job: %s' % query_data) + logger.debug('Submitting query job: %s' % query_data) job_collection = self.bigquery.jobs() @@ -206,7 +208,7 @@ def _submit_query_job(self, query_data): # raise exceptions if it's not an async query # and job is not completed after timeout if not job_complete and query_data.get("timeoutMs", False): - logging.error('BigQuery job %s timeout' % job_id) + logger.error('BigQuery job %s timeout' % job_id) raise BigQueryTimeoutException() return job_id, [self._transform_row(row, schema) for row in rows] @@ -235,7 +237,7 @@ def _insert_job(self, body_object): BigQueryTimeoutException on timeout """ - logging.debug('Submitting job: %s' % body_object) + logger.debug('Submitting job: %s' % body_object) job_collection = self.bigquery.jobs() @@ -274,7 +276,7 @@ def query(self, query, max_results=None, timeout=0, dry_run=False): on timeout """ - logging.debug('Executing query: %s' % query) + logger.debug('Executing query: %s' % query) query_data = { 'query': query, @@ -301,7 +303,7 @@ def get_query_schema(self, job_id): query_reply = self.get_query_results(job_id, offset=0, limit=0) if not query_reply['jobComplete']: - logging.warning('BigQuery job %s not complete' % job_id) + logger.warning('BigQuery job %s not complete' % job_id) raise UnfinishedQueryException() return query_reply['schema']['fields'] @@ -330,7 +332,7 @@ def get_table_schema(self, dataset, table): datasetId=dataset).execute() except HttpError 
as e: if int(e.resp['status']) == 404: - logging.warn('Table %s.%s does not exist', dataset, table) + logger.warn('Table %s.%s does not exist', dataset, table) return None raise @@ -383,7 +385,7 @@ def get_query_rows(self, job_id, offset=None, limit=None, timeout=0): # Get query results query_reply = self.get_query_results(job_id, offset=offset, limit=limit, timeout=timeout) if not query_reply['jobComplete']: - logging.warning('BigQuery job %s not complete' % job_id) + logger.warning('BigQuery job %s not complete' % job_id) raise UnfinishedQueryException() schema = query_reply["schema"]["fields"] @@ -524,7 +526,7 @@ def create_table(self, dataset, table, schema, expiration_time=None): return table except HttpError as e: - logging.error(('Cannot create table {0}.{1}\n' + logger.error(('Cannot create table {0}.{1}\n' 'Http Error: {2}').format(dataset, table, e.content)) if self.swallow_results: @@ -572,7 +574,7 @@ def update_table(self, dataset, table, schema): return result except HttpError as e: - logging.error(('Cannot update table {0}.{1}\n' + logger.error(('Cannot update table {0}.{1}\n' 'Http Error: {2}').format(dataset, table, e.content)) if self.swallow_results: @@ -620,7 +622,7 @@ def patch_table(self, dataset, table, schema): return result except HttpError as e: - logging.error(('Cannot patch table {0}.{1}\n' + logger.error(('Cannot patch table {0}.{1}\n' 'Http Error: {2}').format(dataset, table, e.content)) if self.swallow_results: @@ -670,7 +672,7 @@ def create_view(self, dataset, view, query): return view except HttpError as e: - logging.error(('Cannot create view {0}.{1}\n' + logger.error(('Cannot create view {0}.{1}\n' 'Http Error: {2}').format(dataset, view, e.content)) if self.swallow_results: @@ -707,7 +709,7 @@ def delete_table(self, dataset, table): return response except HttpError as e: - logging.error(('Cannot delete table {0}.{1}\n' + logger.error(('Cannot delete table {0}.{1}\n' 'Http Error: {2}').format(dataset, table, e.content)) if self.swallow_results: @@ -900,7 +902,7 @@ def import_data_from_uris( } } - logging.debug("Creating load job %s" % body) + logger.debug("Creating load job %s" % body) job_resource = self._insert_job(body) self._raise_insert_exception_if_error(job_resource) return job_resource @@ -994,7 +996,7 @@ def export_data_to_uris( } } - logging.info("Creating export job %s" % body) + logger.info("Creating export job %s" % body) job_resource = self._insert_job(body) self._raise_insert_exception_if_error(job_resource) return job_resource @@ -1090,7 +1092,7 @@ def write_to_table( } } - logging.info("Creating write to table job %s" % body) + logger.info("Creating write to table job %s" % body) job_resource = self._insert_job(body) self._raise_insert_exception_if_error(job_resource) return job_resource @@ -1139,7 +1141,7 @@ def wait_for_job(self, job, interval=5, timeout=60): # raise exceptions if timeout if not complete: - logging.error('BigQuery job %s timeout' % job_id) + logger.error('BigQuery job %s timeout' % job_id) raise BigQueryTimeoutException() return job_resource @@ -1200,7 +1202,7 @@ def push_rows(self, dataset, table, rows, insert_id_key=None, ).execute() if response.get('insertErrors'): - logging.error('BigQuery insert errors: %s' % response) + logger.error('BigQuery insert errors: %s' % response) if self.swallow_results: return False else: @@ -1212,7 +1214,7 @@ def push_rows(self, dataset, table, rows, insert_id_key=None, return response except HttpError as e: - logging.exception('Problem with BigQuery insertAll') + 
logger.exception('Problem with BigQuery insertAll') if self.swallow_results: return False else: @@ -1573,7 +1575,7 @@ def create_dataset(self, dataset_id, friendly_name=None, description=None, else: return response except HttpError as e: - logging.error('Cannot create dataset {0}, {1}'.format(dataset_id, + logger.error('Cannot create dataset {0}, {1}'.format(dataset_id, e)) if self.swallow_results: return False @@ -1594,7 +1596,7 @@ def get_datasets(self): result = request.execute() return result.get('datasets', []) except HttpError as e: - logging.error("Cannot list datasets: {0}".format(e)) + logger.error("Cannot list datasets: {0}".format(e)) return None def delete_dataset(self, dataset_id, delete_contents=False): @@ -1630,7 +1632,7 @@ def delete_dataset(self, dataset_id, delete_contents=False): else: return response except HttpError as e: - logging.error('Cannot delete dataset {0}: {1}'.format(dataset_id, + logger.error('Cannot delete dataset {0}: {1}'.format(dataset_id, e)) if self.swallow_results: return False @@ -1673,7 +1675,7 @@ def update_dataset(self, dataset_id, friendly_name=None, description=None, else: return response except HttpError as e: - logging.error('Cannot update dataset {0}: {1}'.format(dataset_id, + logger.error('Cannot update dataset {0}: {1}'.format(dataset_id, e)) if self.swallow_results: return False @@ -1715,7 +1717,7 @@ def patch_dataset(self, dataset_id, friendly_name=None, description=None, else: return response except HttpError as e: - logging.error('Cannot patch dataset {0}: {1}'.format(dataset_id, + logger.error('Cannot patch dataset {0}: {1}'.format(dataset_id, e)) if self.swallow_results: return False diff --git a/bigquery/query_builder.py b/bigquery/query_builder.py index cb5e60a..fb02896 100644 --- a/bigquery/query_builder.py +++ b/bigquery/query_builder.py @@ -1,4 +1,6 @@ -import logging +from logging import getLogger + +logger = getLogger(__name__) def render_query(dataset, tables, select=None, conditions=None, @@ -131,8 +133,7 @@ def _render_sources(dataset, tables): The data set to fetch log data from. tables : Union[dict, list] The tables to fetch log data from - - Returns +Returns ------- str A string that represents the "from" part of a query. 
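Because messages now flow through the ``bigquery`` package logger instead of
the root logger, an application can tune the library's verbosity on its own,
without touching its other logging. A minimal application-side sketch (the
handler and level shown here are illustrative choices, not part of the patch):

    import logging

    # Keep the library's warnings and errors, drop its DEBUG/INFO chatter.
    bq_logger = logging.getLogger('bigquery')
    bq_logger.addHandler(logging.StreamHandler())
    bq_logger.setLevel(logging.WARNING)
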
@@ -147,8 +148,7 @@ def _render_sources(dataset, tables): tables['from_date'], tables['to_date']) except KeyError as exp: - logging.warn('Missing parameter %s in selecting sources' % - (exp)) + logger.warn('Missing parameter %s in selecting sources' % (exp)) else: return "FROM " + ", ".join( @@ -184,7 +184,7 @@ def _render_conditions(conditions): comparators = condition.get('comparators') if None in (field, field_type, comparators) or not comparators: - logging.warn('Invalid condition passed in: %s' % condition) + logger.warn('Invalid condition passed in: %s' % condition) continue rendered_conditions.append( @@ -239,7 +239,7 @@ def _render_condition(field, field_type, comparators): for v in value]) ) elif isinstance(value, (tuple, list, set)) and len(value) != 2: - logging.warn('Invalid condition passed in: %s' % condition) + logger.warn('Invalid condition passed in: %s' % condition) else: value = _render_condition_value(value, field_type) @@ -335,7 +335,7 @@ def _render_having(having_conditions): comparators = condition.get('comparators') if None in (field, field_type, comparators) or not comparators: - logging.warn('Invalid condition passed in: %s' % condition) + logger.warn('Invalid condition passed in: %s' % condition) continue rendered_conditions.append( From 96789445309a33b4b00ce36c94918c5f7a445922 Mon Sep 17 00:00:00 2001 From: Takashi Nishibayashi Date: Fri, 22 Apr 2016 16:22:21 +0900 Subject: [PATCH 05/78] Revert comment line --- bigquery/query_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigquery/query_builder.py b/bigquery/query_builder.py index fb02896..c149eb1 100644 --- a/bigquery/query_builder.py +++ b/bigquery/query_builder.py @@ -133,7 +133,7 @@ def _render_sources(dataset, tables): The data set to fetch log data from. tables : Union[dict, list] The tables to fetch log data from -Returns + Returns ------- str A string that represents the "from" part of a query. From 22d3e5801df74bb6d4182343c0f0d34691844b99 Mon Sep 17 00:00:00 2001 From: Takashi Nishibayashi Date: Fri, 22 Apr 2016 16:24:01 +0900 Subject: [PATCH 06/78] Revart unnecesarry change --- bigquery/query_builder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigquery/query_builder.py b/bigquery/query_builder.py index c149eb1..b6f568b 100644 --- a/bigquery/query_builder.py +++ b/bigquery/query_builder.py @@ -133,6 +133,7 @@ def _render_sources(dataset, tables): The data set to fetch log data from. tables : Union[dict, list] The tables to fetch log data from + Returns ------- str From 7ce159a4002fe75a0eb2165afe11bf100f0e9b88 Mon Sep 17 00:00:00 2001 From: orangain Date: Sat, 23 Apr 2016 14:48:50 +0900 Subject: [PATCH 07/78] Read project_id from JSON key file. A JSON key file provided by Google contains project_id. Now project_id argument of get_client() is optional and read from the JSON key file if json_key or json_key_file is provided. I believe this improve usability of get_client(). --- README.md | 2 +- bigquery/client.py | 16 ++++++++---- bigquery/tests/test_client.py | 48 ++++++++++++++++++++++++++++++++--- 3 files changed, 56 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 8d42cb3..6b4606c 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ client = get_client(project_id, service_account=service_account, # JSON key provided by Google json_key = 'key.json' -client = get_client(project_id, json_key_file=json_key, readonly=True) +client = get_client(json_key_file=json_key, readonly=True) # Submit an async query. 
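 # query() returns a (job id, results) tuple; for an asynchronous query the
 # results list stays empty until the job finishes.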
job_id, _results = client.query('SELECT * FROM dataset.my_table LIMIT 1000') diff --git a/bigquery/client.py b/bigquery/client.py index 68d3bb5..cacb876 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -45,7 +45,7 @@ logger = getLogger(__name__) -def get_client(project_id, credentials=None, +def get_client(project_id=None, credentials=None, service_url=None, service_account=None, private_key=None, private_key_file=None, json_key=None, json_key_file=None, @@ -56,8 +56,8 @@ def get_client(project_id, credentials=None, Parameters ---------- - project_id : str - The BigQuery project id + project_id : str, optional + The BigQuery project id, required unless json_key or json_key_file is provided. credentials : oauth2client.client.SignedJwtAssertionCredentials, optional AssertionCredentials instance to authenticate requests to BigQuery (optional, must provide `service_account` and (`private_key` or `private_key_file`) or @@ -96,6 +96,10 @@ def get_client(project_id, credentials=None, json_key or json_key_file), \ 'Must provide AssertionCredentials or service account and P12 key or JSON key' + if not project_id: + assert json_key or json_key_file, \ + 'Must provide project_id unless json_key or json_key_file is provided' + if service_url is None: service_url = DISCOVERY_URI @@ -119,12 +123,14 @@ def get_client(project_id, credentials=None, scopes=scope) if json_key_file: - credentials = _credentials().from_json_keyfile_name(json_key_file, - scopes=scope) + with open(json_key_file, 'r') as key_file: + json_key = json.load(key_file) if json_key: credentials = _credentials().from_json_keyfile_dict(json_key, scopes=scope) + if not project_id: + project_id = json_key['project_id'] bq_service = _get_bq_service(credentials=credentials, service_url=service_url) diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py index be1ff0f..ffd7818 100644 --- a/bigquery/tests/test_client.py +++ b/bigquery/tests/test_client.py @@ -143,19 +143,23 @@ def test_initialize_key_file(self, mock_build, mock_return_cred): @mock.patch('bigquery.client._credentials') @mock.patch('bigquery.client.build') - def test_initialize_json_key_file(self, mock_build, mock_return_cred): + @mock.patch('__builtin__.open' if six.PY2 else 'builtins.open') + def test_initialize_json_key_file(self, mock_open, mock_build, mock_return_cred): """Ensure that a BigQueryClient is initialized and returned with read/write permissions using a JSON key file. 
""" from bigquery.client import BIGQUERY_SCOPE + import json mock_cred = mock.Mock() mock_http = mock.Mock() mock_service_url = mock.Mock() - mock_cred.from_json_keyfile_name.return_value.authorize.return_value = mock_http + mock_cred.from_json_keyfile_dict.return_value.authorize.return_value = mock_http mock_bq = mock.Mock() mock_build.return_value = mock_bq json_key_file = 'key.json' + json_key = {'client_email': 'mail', 'private_key': 'pkey'} + mock_open.return_value.__enter__.return_value.read.return_value = json.dumps(json_key) project_id = 'project' mock_return_cred.return_value = mock_cred @@ -164,15 +168,51 @@ def test_initialize_json_key_file(self, mock_build, mock_return_cred): json_key_file=json_key_file, readonly=False) mock_return_cred.assert_called_once_with() - mock_cred.from_json_keyfile_name.assert_called_once_with(json_key_file, + mock_cred.from_json_keyfile_dict.assert_called_once_with(json_key, scopes=BIGQUERY_SCOPE) self.assertTrue( - mock_cred.from_json_keyfile_name.return_value.authorize.called) + mock_cred.from_json_keyfile_dict.return_value.authorize.called) mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, discoveryServiceUrl=mock_service_url) self.assertEquals(mock_bq, bq_client.bigquery) self.assertEquals(project_id, bq_client.project_id) + @mock.patch('bigquery.client._credentials') + @mock.patch('bigquery.client.build') + @mock.patch('__builtin__.open' if six.PY2 else 'builtins.open') + def test_initialize_json_key_file_without_project_id(self, mock_open, mock_build, + mock_return_cred): + """Ensure that a BigQueryClient is initialized and returned with + read/write permissions using a JSON key file without project_id. + """ + from bigquery.client import BIGQUERY_SCOPE + import json + + mock_cred = mock.Mock() + mock_http = mock.Mock() + mock_service_url = mock.Mock() + mock_cred.from_json_keyfile_dict.return_value.authorize.return_value = mock_http + mock_bq = mock.Mock() + mock_build.return_value = mock_bq + json_key_file = 'key.json' + json_key = {'client_email': 'mail', 'private_key': 'pkey', 'project_id': 'project'} + mock_open.return_value.__enter__.return_value.read.return_value = json.dumps(json_key) + mock_return_cred.return_value = mock_cred + + bq_client = client.get_client( + service_url=mock_service_url, json_key_file=json_key_file, readonly=False) + + mock_open.assert_called_once_with(json_key_file, 'r') + mock_return_cred.assert_called_once_with() + mock_cred.from_json_keyfile_dict.assert_called_once_with(json_key, + scopes=BIGQUERY_SCOPE) + self.assertTrue( + mock_cred.from_json_keyfile_dict.return_value.authorize.called) + mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, + discoveryServiceUrl=mock_service_url) + self.assertEquals(mock_bq, bq_client.bigquery) + self.assertEquals(json_key['project_id'], bq_client.project_id) + class TestQuery(unittest.TestCase): From 890270affac1138e6b568ef28a9cf325f6dfcf45 Mon Sep 17 00:00:00 2001 From: Tyler Treat Date: Sat, 23 Apr 2016 13:14:07 -0500 Subject: [PATCH 08/78] Bump version to 1.7.0 --- bigquery/__init__.py | 2 +- setup.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/bigquery/__init__.py b/bigquery/__init__.py index 086be47..3a4f000 100644 --- a/bigquery/__init__.py +++ b/bigquery/__init__.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -__version__ = '1.6.0' +__version__ = '1.7.0' from .client import get_client from .client import ( diff --git a/setup.py b/setup.py index b0c737b..acdaf5e 100644 --- a/setup.py +++ 
b/setup.py @@ -1,13 +1,12 @@ from setuptools import find_packages from setuptools import setup - -VERSION = '1.6.0' +from bigquery import __version__ setup_args = dict( name='BigQuery-Python', description='Simple Python client for interacting with Google BigQuery.', url='https://github.com/tylertreat/BigQuery-Python', - version=VERSION, + version=__version__, license='Apache', packages=find_packages(), include_package_data=True, From 4f84b83a9cfa9cf243f784bac273ceea978102e0 Mon Sep 17 00:00:00 2001 From: Tyler Treat Date: Sat, 23 Apr 2016 13:36:08 -0500 Subject: [PATCH 09/78] Various formatting fixes and fix import issue --- bigquery/client.py | 184 +++++++++++++++++++++++++-------------------- 1 file changed, 101 insertions(+), 83 deletions(-) diff --git a/bigquery/client.py b/bigquery/client.py index cacb876..9bab750 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -10,13 +10,17 @@ import six from bigquery.errors import (BigQueryTimeoutException, JobExecutingException, JobInsertException, UnfinishedQueryException) -from bigquery.schema_builder import schema_from_record from googleapiclient.discovery import build, DISCOVERY_URI from googleapiclient.errors import HttpError from httplib2 import Http -BIGQUERY_SCOPE = ['https://www.googleapis.com/auth/bigquery'] -BIGQUERY_SCOPE_READ_ONLY = ['https://www.googleapis.com/auth/bigquery.readonly'] +BIGQUERY_SCOPE = [ + 'https://www.googleapis.com/auth/bigquery' +] + +BIGQUERY_SCOPE_READ_ONLY = [ + 'https://www.googleapis.com/auth/bigquery.readonly' +] CACHE_TIMEOUT = timedelta(seconds=30) @@ -57,33 +61,37 @@ def get_client(project_id=None, credentials=None, Parameters ---------- project_id : str, optional - The BigQuery project id, required unless json_key or json_key_file is provided. + The BigQuery project id, required unless json_key or json_key_file is + provided. credentials : oauth2client.client.SignedJwtAssertionCredentials, optional - AssertionCredentials instance to authenticate requests to BigQuery (optional, - must provide `service_account` and (`private_key` or `private_key_file`) or - (`json_key` or `json_key_file`) if not included + AssertionCredentials instance to authenticate requests to BigQuery + (optional, must provide `service_account` and (`private_key` or + `private_key_file`) or (`json_key` or `json_key_file`) if not included service_url : str, optional - A URI string template pointing to the location of Google's API discovery - service. Requires two parameters {api} and {apiVersion} that when filled in - produce an absolute URI to the discovery document for that service. If not set - then the default googleapiclient discovery URI is used. See `credentials` + A URI string template pointing to the location of Google's API + discovery service. Requires two parameters {api} and {apiVersion} that + when filled in produce an absolute URI to the discovery document for + that service. If not set then the default googleapiclient discovery URI + is used. See `credentials` service_account : str, optional The Google API service account name. See `credentials` private_key : str, optional - The private key associated with the service account in PKCS12 or PEM format. See `credentials` + The private key associated with the service account in PKCS12 or PEM + format. See `credentials` private_key_file : str, optional - The name of the file containing the private key associated with the service - account in PKCS12 or PEM format. 
See `credentials` + The name of the file containing the private key associated with the + service account in PKCS12 or PEM format. See `credentials` json_key : dict, optional The JSON key associated with the service account. See `credentials` json_key_file : str, optional - The name of the JSON key file associated with the service account. See `credentials`. + The name of the JSON key file associated with the service account. See + `credentials`. readonly : bool - Bool indicating if BigQuery access is read-only. Has no effect if credentials are - provided. Default True. + Bool indicating if BigQuery access is read-only. Has no effect if + credentials are provided. Default True. swallow_results : bool - If set to False, then return the actual response value instead of converting to - boolean. Default True. + If set to False, then return the actual response value instead of + converting to boolean. Default True. Returns ------- @@ -94,11 +102,13 @@ def get_client(project_id=None, credentials=None, if not credentials: assert (service_account and (private_key or private_key_file)) or ( json_key or json_key_file), \ - 'Must provide AssertionCredentials or service account and P12 key or JSON key' + 'Must provide AssertionCredentials or service account and P12 key\ + or JSON key' if not project_id: assert json_key or json_key_file, \ - 'Must provide project_id unless json_key or json_key_file is provided' + 'Must provide project_id unless json_key or json_key_file is\ + provided' if service_url is None: service_url = DISCOVERY_URI @@ -266,15 +276,15 @@ def query(self, query, max_results=None, timeout=0, dry_run=False): the request times out and returns. dry_run : bool, optional If True, the query isn't actually run. A valid query will return an - empty response, while an invalid one will return the same error message - it would if it wasn't a dry run. + empty response, while an invalid one will return the same error + message it would if it wasn't a dry run. Returns ------- tuple - (job id, query results) if the query completed. If dry_run is True, job id - will be None and results will be empty if the query is valid or a ``dict`` containing - the response if invalid. + (job id, query results) if the query completed. If dry_run is True, + job id will be None and results will be empty if the query is valid + or a ``dict`` containing the response if invalid. Raises ------ @@ -356,8 +366,8 @@ def check_job(self, job_id): ------- tuple (``bool``, ``int``) Whether or not the query has completed and the - total number of rows included in the query table if it has completed - (else 0) + total number of rows included in the query table if it has + completed (else 0) """ query_reply = self.get_query_results(job_id, offset=0, limit=0) @@ -367,8 +377,8 @@ def check_job(self, job_id): def get_query_rows(self, job_id, offset=None, limit=None, timeout=0): """Retrieve a list of rows from a query table by job id. - This method will append results from multiple pages together. If you want - to manually page through results, you can use `get_query_results` + This method will append results from multiple pages together. If you + want to manually page through results, you can use `get_query_results` method directly. 
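 
         A sketch of one manual page fetch (the job id is a placeholder)::
 
             reply = client.get_query_results(job_id, limit=100)
             next_page_token = reply.get('pageToken')
 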
Parameters @@ -389,7 +399,8 @@ def get_query_rows(self, job_id, offset=None, limit=None, timeout=0): """ # Get query results - query_reply = self.get_query_results(job_id, offset=offset, limit=limit, timeout=timeout) + query_reply = self.get_query_results(job_id, offset=offset, + limit=limit, timeout=timeout) if not query_reply['jobComplete']: logger.warning('BigQuery job %s not complete' % job_id) raise UnfinishedQueryException() @@ -401,8 +412,9 @@ def get_query_rows(self, job_id, offset=None, limit=None, timeout=0): # Append to records if there are multiple pages for query results while page_token and (not limit or len(records) < limit): - query_reply = self.get_query_results(job_id, offset=offset, limit=limit, - page_token=page_token, timeout=timeout) + query_reply = self.get_query_results( + job_id, offset=offset, limit=limit, page_token=page_token, + timeout=timeout) page_token = query_reply.get("pageToken") rows = query_reply.get('rows', []) records += [self._transform_row(row, schema) for row in rows] @@ -533,8 +545,7 @@ def create_table(self, dataset, table, schema, expiration_time=None): except HttpError as e: logger.error(('Cannot create table {0}.{1}\n' - 'Http Error: {2}').format(dataset, table, - e.content)) + 'Http Error: {2}').format(dataset, table, e.content)) if self.swallow_results: return False else: @@ -581,8 +592,7 @@ def update_table(self, dataset, table, schema): except HttpError as e: logger.error(('Cannot update table {0}.{1}\n' - 'Http Error: {2}').format(dataset, table, - e.content)) + 'Http Error: {2}').format(dataset, table, e.content)) if self.swallow_results: return False else: @@ -629,8 +639,7 @@ def patch_table(self, dataset, table, schema): except HttpError as e: logger.error(('Cannot patch table {0}.{1}\n' - 'Http Error: {2}').format(dataset, table, - e.content)) + 'Http Error: {2}').format(dataset, table, e.content)) if self.swallow_results: return False else: @@ -679,8 +688,7 @@ def create_view(self, dataset, view, query): except HttpError as e: logger.error(('Cannot create view {0}.{1}\n' - 'Http Error: {2}').format(dataset, view, - e.content)) + 'Http Error: {2}').format(dataset, view, e.content)) if self.swallow_results: return False else: @@ -716,8 +724,7 @@ def delete_table(self, dataset, table): except HttpError as e: logger.error(('Cannot delete table {0}.{1}\n' - 'Http Error: {2}').format(dataset, table, - e.content)) + 'Http Error: {2}').format(dataset, table, e.content)) if self.swallow_results: return False else: @@ -775,21 +782,23 @@ def import_data_from_uris( skip_leading_rows=None, ): """ - Imports data into a BigQuery table from cloud storage. Optional arguments that are not - specified are determined by BigQuery as described: - https://developers.google.com/bigquery/docs/reference/v2/jobs + Imports data into a BigQuery table from cloud storage. 
Optional + arguments that are not specified are determined by BigQuery as + described: + https://developers.google.com/bigquery/docs/reference/v2/jobs Parameters ---------- source_urls : list - A ``list`` of ``str`` objects representing the urls on cloud storage - of the form: gs://bucket/filename + A ``list`` of ``str`` objects representing the urls on cloud + storage of the form: gs://bucket/filename dataset : str String id of the dataset table : str String id of the table job : str, optional - Identifies the job (a unique job id is automatically generated if not provided) + Identifies the job (a unique job id is automatically generated if + not provided) schema : list, optional Represents the BigQuery schema source_format : str, optional @@ -925,8 +934,8 @@ def export_data_to_uris( field_delimiter=None, ): """ - Export data from a BigQuery table to cloud storage. Optional arguments that are - not specified are determined by BigQuery as described: + Export data from a BigQuery table to cloud storage. Optional arguments + that are not specified are determined by BigQuery as described: https://developers.google.com/bigquery/docs/reference/v2/jobs Parameters @@ -939,8 +948,8 @@ def export_data_to_uris( table : str String id of the table job : str, optional - String identifying the job (a unique jobid is automatically generated if - not provided) + String identifying the job (a unique jobid is automatically + generated if not provided) compression : str, optional One of the JOB_COMPRESSION_* constants destination_format : str, optional @@ -1110,8 +1119,8 @@ def wait_for_job(self, job, interval=5, timeout=60): Parameters ---------- job : Union[dict, str] - ``dict`` representing a BigQuery job resource, or a ``str`` representing - the BigQuery job id + ``dict`` representing a BigQuery job resource, or a ``str`` + representing the BigQuery job id interval : float, optional Polling interval in seconds, default = 5 timeout : float, optional @@ -1323,7 +1332,8 @@ def _parse_table_name(self, table_id): Returns ------- tuple - (year/month, app id), or (None, None) if the table id cannot be parsed. + (year/month, app id), or (None, None) if the table id cannot be + parsed. """ # Prefix date @@ -1394,9 +1404,11 @@ def _in_range(self, start_time, end_time, time): time <= start_time <= time + ONE_MONTH or \ time <= end_time <= time + ONE_MONTH - def get_query_results(self, job_id, offset=None, limit=None, page_token=None, timeout=0): - """Execute the query job indicated by the given job id. This is direct mapping to - bigquery api https://cloud.google.com/bigquery/docs/reference/v2/jobs/getQueryResults + def get_query_results(self, job_id, offset=None, limit=None, + page_token=None, timeout=0): + """Execute the query job indicated by the given job id. This is direct + mapping to bigquery api + https://cloud.google.com/bigquery/docs/reference/v2/jobs/getQueryResults Parameters ---------- @@ -1407,7 +1419,8 @@ def get_query_results(self, job_id, offset=None, limit=None, page_token=None, ti limit : int, optional The maximum number of results to retrieve. page_token : optional - Page token, returned by previous call, to request the next page of results. + Page token, returned by previous call, to request the next page of + results. 
timeout : float, optional Timeout in seconds @@ -1551,8 +1564,8 @@ def create_dataset(self, dataset_id, friendly_name=None, description=None, Parameters ---------- dataset_id : str - Unique ``str`` identifying the dataset with the project (the referenceID - of the dataset, not the integer id of the dataset) + Unique ``str`` identifying the dataset with the project (the + referenceID of the dataset, not the integer id of the dataset) friendly_name: str, optional A human readable name description: str, optional @@ -1581,8 +1594,8 @@ def create_dataset(self, dataset_id, friendly_name=None, description=None, else: return response except HttpError as e: - logger.error('Cannot create dataset {0}, {1}'.format(dataset_id, - e)) + logger.error( + 'Cannot create dataset {0}, {1}'.format(dataset_id, e)) if self.swallow_results: return False else: @@ -1611,10 +1624,11 @@ def delete_dataset(self, dataset_id, delete_contents=False): Parameters ---------- dataset_id : str - Unique ``str`` identifying the datset with the project (the referenceId of the dataset) + Unique ``str`` identifying the datset with the project (the + referenceId of the dataset) delete_contents : bool, optional - If True, forces the deletion of the dataset even when the dataset contains data - (Default = False) + If True, forces the deletion of the dataset even when the dataset + contains data (Default = False) Returns ------- @@ -1638,8 +1652,8 @@ def delete_dataset(self, dataset_id, delete_contents=False): else: return response except HttpError as e: - logger.error('Cannot delete dataset {0}: {1}'.format(dataset_id, - e)) + logger.error( + 'Cannot delete dataset {0}: {1}'.format(dataset_id, e)) if self.swallow_results: return False else: @@ -1654,7 +1668,8 @@ def update_dataset(self, dataset_id, friendly_name=None, description=None, Parameters ---------- dataset_id : str - Unique ``str`` identifying the dataset with the project (the referencedId of the dataset) + Unique ``str`` identifying the dataset with the project (the + referencedId of the dataset) friendly_name : str, optional An optional descriptive name for the dataset. description : str, optional @@ -1665,8 +1680,8 @@ def update_dataset(self, dataset_id, friendly_name=None, description=None, Returns ------- Union[bool, dict] - ``bool`` indicating if the update was successful or not, or response - from BigQuery if swallow_results is set for False. + ``bool`` indicating if the update was successful or not, or + response from BigQuery if swallow_results is set for False. """ try: datasets = self.bigquery.datasets() @@ -1681,8 +1696,8 @@ def update_dataset(self, dataset_id, friendly_name=None, description=None, else: return response except HttpError as e: - logger.error('Cannot update dataset {0}: {1}'.format(dataset_id, - e)) + logger.error( + 'Cannot update dataset {0}: {1}'.format(dataset_id, e)) if self.swallow_results: return False else: @@ -1697,7 +1712,8 @@ def patch_dataset(self, dataset_id, friendly_name=None, description=None, Parameters ---------- dataset_id : str - Unique string idenfitying the dataset with the project (the referenceId of the dataset) + Unique string idenfitying the dataset with the project (the + referenceId of the dataset) friendly_name : str, optional An optional descriptive name for the dataset. 
description : str, optional @@ -1723,8 +1739,7 @@ def patch_dataset(self, dataset_id, friendly_name=None, description=None, else: return response except HttpError as e: - logger.error('Cannot patch dataset {0}: {1}'.format(dataset_id, - e)) + logger.error('Cannot patch dataset {0}: {1}'.format(dataset_id, e)) if self.swallow_results: return False else: @@ -1732,7 +1747,8 @@ def patch_dataset(self, dataset_id, friendly_name=None, description=None, def dataset_resource(self, ref_id, friendly_name=None, description=None, access=None): - """See https://developers.google.com/bigquery/docs/reference/v2/datasets#resource + """See + https://developers.google.com/bigquery/docs/reference/v2/datasets#resource Parameters ---------- @@ -1775,9 +1791,10 @@ def schema_from_record(cls, record): record : dict representing a record to be inserted into big query, where all keys are ``str`` objects (representing column names in - the record) and all values are of type ``int``, ``str``, ``unicode``, - ``float``, ``bool``, ``datetime``, or ``dict``. A ``dict`` value represents a - record, and must conform to the same restrictions as record + the record) and all values are of type ``int``, ``str``, + ``unicode``, ``float``, ``bool``, ``datetime``, or ``dict``. A + ``dict`` value represents a record, and must conform to the same + restrictions as record. Returns ------- @@ -1786,9 +1803,10 @@ def schema_from_record(cls, record): Notes ----- - Results are undefined if a different value type is provided for a repeated - field: E.g. + Results are undefined if a different value type is provided for a + repeated field: E.g. >>> { rfield: [ { x: 1}, {x: "a string"} ] } # undefined! """ + from bigquery.schema_builder import schema_from_record return schema_from_record(record) From 7da5061c3e8a51bc1a1ea35da799795b2be34b46 Mon Sep 17 00:00:00 2001 From: Tyler Treat Date: Sat, 23 Apr 2016 13:56:46 -0500 Subject: [PATCH 10/78] Fix version --- bigquery/__init__.py | 2 +- bigquery/version.py | 1 + setup.py | 9 +++++++-- 3 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 bigquery/version.py diff --git a/bigquery/__init__.py b/bigquery/__init__.py index 3a4f000..b393875 100644 --- a/bigquery/__init__.py +++ b/bigquery/__init__.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -__version__ = '1.7.0' +from .version import __version__ from .client import get_client from .client import ( diff --git a/bigquery/version.py b/bigquery/version.py new file mode 100644 index 0000000..0e1a38d --- /dev/null +++ b/bigquery/version.py @@ -0,0 +1 @@ +__version__ = '1.7.0' diff --git a/setup.py b/setup.py index acdaf5e..fc1c5de 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,17 @@ +from distutils.util import convert_path from setuptools import find_packages from setuptools import setup -from bigquery import __version__ + +ns = {} +version_path = convert_path('bigquery/version.py') +with open(version_path) as version_file: + exec(version_file.read(), ns) setup_args = dict( name='BigQuery-Python', description='Simple Python client for interacting with Google BigQuery.', url='https://github.com/tylertreat/BigQuery-Python', - version=__version__, + version=ns['__version__'], license='Apache', packages=find_packages(), include_package_data=True, From c9eb5e4cf10682b88ef60c134ce4724d1f18589c Mon Sep 17 00:00:00 2001 From: Tyler Treat Date: Sat, 23 Apr 2016 14:05:44 -0500 Subject: [PATCH 11/78] PEP8 formatting fixes --- bigquery/query_builder.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 
deletions(-) diff --git a/bigquery/query_builder.py b/bigquery/query_builder.py index b6f568b..8fc403f 100644 --- a/bigquery/query_builder.py +++ b/bigquery/query_builder.py @@ -15,21 +15,24 @@ def render_query(dataset, tables, select=None, conditions=None, tables : Union[dict, list] The table in `dataset` to query. select : dict, optional - The keys function as column names and the values function as options to apply to - the select field such as alias and format. For example, select['start_time'] might - have the form {'alias': 'StartTime', 'format': 'INTEGER-FORMAT_UTC_USEC'}, which would - be represented as 'SEC_TO_TIMESTAMP(INTEGER(start_time)) as StartTime' in a query. Pass - `None` to seoect all. + The keys function as column names and the values function as options to + apply to the select field such as alias and format. For example, + select['start_time'] might have the form + {'alias': 'StartTime', 'format': 'INTEGER-FORMAT_UTC_USEC'}, which + would be represented as 'SEC_TO_TIMESTAMP(INTEGER(start_time)) as + StartTime' in a query. Pass `None` to select all. conditions : list, optional - a ``list`` of ``dict`` objects to filter results by. Each dict should have the keys 'field', - 'type', and 'comparators'. The first two map to strings representing the field (e.g. 'foo') - and type (e.g. 'FLOAT'). 'comparators' maps to another ``dict`` containing the keys 'condition', - 'negate', and 'value'. If 'comparators' = {'condition': '>=', 'negate': False, 'value': 1}, this - example will be rdnered as 'foo >= FLOAT('1')' in the query. + a ``list`` of ``dict`` objects to filter results by. Each dict should + have the keys 'field', 'type', and 'comparators'. The first two map to + strings representing the field (e.g. 'foo') and type (e.g. 'FLOAT'). + 'comparators' maps to another ``dict`` containing the keys 'condition', + 'negate', and 'value'. + If 'comparators' = {'condition': '>=', 'negate': False, 'value': 1}, + this example will be rdnered as 'foo >= FLOAT('1')' in the query. ``list`` of field names to group by order_by : dict, optional - Keys = {'field', 'direction'}. `dict` should be formatted as {'field':'TimeStamp, 'direction':'desc'} - or similar + Keys = {'field', 'direction'}. `dict` should be formatted as + {'field':'TimeStamp, 'direction':'desc'} or similar Returns ------- @@ -149,7 +152,8 @@ def _render_sources(dataset, tables): tables['from_date'], tables['to_date']) except KeyError as exp: - logger.warn('Missing parameter %s in selecting sources' % (exp)) + logger.warn( + 'Missing parameter %s in selecting sources' % (exp)) else: return "FROM " + ", ".join( From bab1c997ea4b40ee84dc341a6bc10022380311e1 Mon Sep 17 00:00:00 2001 From: Takashi Nishibayashi Date: Thu, 28 Apr 2016 15:02:06 +0900 Subject: [PATCH 12/78] Add template_suffix option support --- bigquery/client.py | 9 ++++++++- bigquery/tests/test_client.py | 5 ++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/bigquery/client.py b/bigquery/client.py index 9bab750..ea5c503 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -1162,7 +1162,8 @@ def wait_for_job(self, job, interval=5, timeout=60): return job_resource def push_rows(self, dataset, table, rows, insert_id_key=None, - skip_invalid_rows=None, ignore_unknown_values=None): + skip_invalid_rows=None, ignore_unknown_values=None, + template_suffix=None): """Upload rows to BigQuery table. 
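 
         A sketch that streams rows into a dated template table (the dataset,
         table, and suffix names are placeholders)::
 
             client.push_rows('dataset', 'events', rows,
                              skip_invalid_rows=True,
                              ignore_unknown_values=True,
                              template_suffix='20160428')
 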
Parameters @@ -1179,6 +1180,9 @@ def push_rows(self, dataset, table, rows, insert_id_key=None, Insert all valid rows of a request, even if invalid rows exist. ignore_unknown_values : bool, optional Accept rows that contain values that do not match the schema. + template_suffix : str, optional + Inserts the rows into an {table}{template_suffix}. + If table {table}{template_suffix} doesn't exist, create from {table}. Returns ------- @@ -1208,6 +1212,9 @@ def push_rows(self, dataset, table, rows, insert_id_key=None, if ignore_unknown_values is not None: data['ignoreUnknownValues'] = ignore_unknown_values + if template_suffix is not None: + data['templateSuffix'] = template_suffix + try: response = table_data.insertAll( projectId=self.project_id, diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py index ffd7818..39bf05b 100644 --- a/bigquery/tests/test_client.py +++ b/bigquery/tests/test_client.py @@ -2176,9 +2176,12 @@ def test_request_data_with_options(self): self.dataset, self.table, self.rows, insert_id_key='one', ignore_unknown_values=True, - skip_invalid_rows=True) + skip_invalid_rows=True, + template_suffix='20160428' + ) expected_body['ignoreUnknownValues'] = True expected_body['skipInvalidRows'] = True + expected_body['templateSuffix'] = '20160428' self.mock_table_data.insertAll.assert_called_with( projectId=self.project, datasetId=self.dataset, From d51b251242c1004017ab7839bce33e85d1f258ee Mon Sep 17 00:00:00 2001 From: Ruxandra Burtica Date: Mon, 16 May 2016 19:26:24 +0300 Subject: [PATCH 13/78] Added get_all_tables method, returning a list with all tables within a dataset #97 --- bigquery/client.py | 68 +++++++++++++++++++++++++++-------- bigquery/tests/test_client.py | 38 ++++++++++++++++++-- 2 files changed, 89 insertions(+), 17 deletions(-) diff --git a/bigquery/client.py b/bigquery/client.py index ea5c503..dff7307 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -1249,8 +1249,32 @@ def push_rows(self, dataset, table, rows, insert_id_key=None, }] } + def get_all_tables(self, dataset_id): + """Retrieve a list of tables for the dataset. + + Parameters + ---------- + dataset_id : str + The dataset to retrieve table data for. + + Returns + ------- + dict + A ``list`` with all table names + """ + tables_data = self._get_all_tables_for_dataset(dataset_id) + + tables = [] + for table in tables_data['tables']: + table_name = table.get('tableReference', {}).get('tableId') + if table_name: + tables.append(table_name) + return tables + def _get_all_tables(self, dataset_id, cache=False): - """Retrieve a list of all tables for the dataset. + """Retrieve the list of tables for dataset, that respect the formats: + * appid_YYYY_MM + * YYYY_MM_appid Parameters ---------- @@ -1272,23 +1296,39 @@ def _get_all_tables(self, dataset_id, cache=False): do_fetch = False if do_fetch: - result = self.bigquery.tables().list( - projectId=self.project_id, - datasetId=dataset_id).execute() - - page_token = result.get('nextPageToken') - while page_token: - res = self.bigquery.tables().list( - projectId=self.project_id, - datasetId=dataset_id, - pageToken=page_token - ).execute() - page_token = res.get('nextPageToken') - result['tables'] += res.get('tables', []) + result = self._get_all_tables_for_dataset(dataset_id) self.cache[dataset_id] = (datetime.now(), result) return self._parse_table_list_response(result) + def _get_all_tables_for_dataset(self, dataset_id): + """Retrieve a list of all tables for the dataset. 
+
+        Parameters
+        ----------
+        dataset_id : str
+            The dataset to retrieve table names for
+
+        Returns
+        -------
+        dict
+            A ``dict`` with a 'tables' key listing all tables
+        """
+        result = self.bigquery.tables().list(
+            projectId=self.project_id,
+            datasetId=dataset_id).execute()
+
+        page_token = result.get('nextPageToken')
+        while page_token:
+            res = self.bigquery.tables().list(
+                projectId=self.project_id,
+                datasetId=dataset_id,
+                pageToken=page_token
+            ).execute()
+            page_token = res.get('nextPageToken')
+            result['tables'] += res.get('tables', [])
+        return result
+
     def _parse_table_list_response(self, list_response):
         """Parse the response received from calling list on tables.

diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py
index 39bf05b..263e2d7 100644
--- a/bigquery/tests/test_client.py
+++ b/bigquery/tests/test_client.py
@@ -1290,12 +1290,21 @@ def test_not_inside_range(self):
                "tableId": "appspot_6_2013_06"
            }
        },
+       {
+           "kind": "bigquery#table",
+           "id": "project:dataset.table_not_matching_naming",
+           "tableReference": {
+               "projectId": "project",
+               "datasetId": "dataset",
+               "tableId": "table_not_matching_naming"
+           }
+       },
        {
            "kind": "bigquery#table",
            "id": "bad table data"
-       }
+       },
    ],
-   "totalItems": 8
+   "totalItems": 9
 }

@@ -2191,7 +2200,7 @@ class TestGetAllTables(unittest.TestCase):

-    def test_get_tables(self):
+    def test_get_all_tables(self):
        """Ensure get_all_tables fetches table names from BigQuery."""

        mock_execute = mock.Mock()
@@ -2205,6 +2214,29 @@ def test_get_tables(self):

        bq = client.BigQueryClient(mock_bq_service, 'project')

+       expected_result = [
+           '2013_05_appspot', '2013_06_appspot_1', '2013_06_appspot_2',
+           '2013_06_appspot_3', '2013_06_appspot_4', '2013_06_appspot_5',
+           'appspot_6_2013_06', 'table_not_matching_naming'
+       ]
+
+       tables = bq.get_all_tables('dataset')
+       self.assertEquals(expected_result, tables)
+
+    def test_get_tables(self):
+       """Ensure _get_all_tables fetches table names from BigQuery."""
+
+       mock_execute = mock.Mock()
+       mock_execute.execute.return_value = FULL_TABLE_LIST_RESPONSE
+
+       mock_tables = mock.Mock()
+       mock_tables.list.return_value = mock_execute
+
+       mock_bq_service = mock.Mock()
+       mock_bq_service.tables.return_value = mock_tables
+
+       bq = client.BigQueryClient(mock_bq_service, 'project')
+
        expected_result = {
            'appspot-3': {'2013_06_appspot_3': 1370044800},
            'appspot-2': {'2013_06_appspot_2': 1370044800},

From 76bb8c2d8270238a243e054a2bfaf59a8819ced8 Mon Sep 17 00:00:00 2001
From: Ruxandra Burtica
Date: Mon, 16 May 2016 19:31:07 +0300
Subject: [PATCH 14/78] Updated docstring. #97

---
 bigquery/client.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/bigquery/client.py b/bigquery/client.py
index dff7307..4917e35 100644
--- a/bigquery/client.py
+++ b/bigquery/client.py
@@ -1259,8 +1259,7 @@ def get_all_tables(self, dataset_id):

         Returns
         -------
-        dict
-            A ``list`` with all table names
+        A ``list`` with all table names
         """
         tables_data = self._get_all_tables_for_dataset(dataset_id)

From 67af0041815d1a564a3f81c7a73c0218da520b6b Mon Sep 17 00:00:00 2001
From: Ruxandra Burtica
Date: Fri, 20 May 2016 13:44:50 +0300
Subject: [PATCH 15/78] Fixed KeyError when there are no tables for a dataset - returning [] in that case.
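A caller-side sketch of the method these three patches introduce and refine (the dataset name is hypothetical):

from bigquery import get_client

client = get_client('my-project', json_key_file='key.json')

# A flat list of table IDs, regardless of table naming convention.
tables = client.get_all_tables('my_dataset')

# With the fix described above, an empty dataset yields [] rather than
# raising a KeyError.
for name in tables:
    print(name)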
--- bigquery/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigquery/client.py b/bigquery/client.py index 4917e35..390ff68 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -1264,7 +1264,7 @@ def get_all_tables(self, dataset_id): tables_data = self._get_all_tables_for_dataset(dataset_id) tables = [] - for table in tables_data['tables']: + for table in tables_data.get('tables', []): table_name = table.get('tableReference', {}).get('tableId') if table_name: tables.append(table_name) From 5fa94427230970747211941f1ef844b8f856a8ba Mon Sep 17 00:00:00 2001 From: Tyler Treat Date: Sat, 21 May 2016 12:54:05 -0500 Subject: [PATCH 16/78] Bump version to 1.8.0 --- bigquery/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigquery/version.py b/bigquery/version.py index 0e1a38d..b280975 100644 --- a/bigquery/version.py +++ b/bigquery/version.py @@ -1 +1 @@ -__version__ = '1.7.0' +__version__ = '1.8.0' From 3466050cb182ef5b6553d891c2cf0f3b254a57a9 Mon Sep 17 00:00:00 2001 From: Ruxandra Burtica Date: Tue, 24 May 2016 08:37:45 +0300 Subject: [PATCH 17/78] Added limit to BigQuery. --- README.md | 3 ++- bigquery/query_builder.py | 28 +++++++++++++++++++++++++--- bigquery/version.py | 2 +- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 6b4606c..7cf342a 100644 --- a/README.md +++ b/README.md @@ -126,7 +126,8 @@ query = render_query( conditions=conditions, groupings=grouping, having=having, - order_by=order_by + order_by=order_by, + limit=47 ) job_id, _ = client.query(query) diff --git a/bigquery/query_builder.py b/bigquery/query_builder.py index 8fc403f..7362148 100644 --- a/bigquery/query_builder.py +++ b/bigquery/query_builder.py @@ -4,7 +4,7 @@ def render_query(dataset, tables, select=None, conditions=None, - groupings=None, having=None, order_by=None): + groupings=None, having=None, order_by=None, limit=None): """Render a query that will run over the given tables using the specified parameters. @@ -33,6 +33,8 @@ def render_query(dataset, tables, select=None, conditions=None, order_by : dict, optional Keys = {'field', 'direction'}. `dict` should be formatted as {'field':'TimeStamp, 'direction':'desc'} or similar + limit : int, optional + Limit the amount of data needed to be returned. Returns ------- @@ -43,13 +45,14 @@ def render_query(dataset, tables, select=None, conditions=None, if None in (dataset, tables): return None - query = "%s %s %s %s %s %s" % ( + query = "%s %s %s %s %s %s %s" % ( _render_select(select), _render_sources(dataset, tables), _render_conditions(conditions), _render_groupings(groupings), _render_having(having), - _render_order(order_by) + _render_order(order_by), + _render_limit(limit) ) return query @@ -372,3 +375,22 @@ def _render_order(order): return '' return "ORDER BY %s %s" % (", ".join(order['fields']), order['direction']) + + +def _render_limit(limit): + """Render the limit part of a query. + + Parameters + ---------- + limit : int, optional + Limit the amount of data needed to be returned. + + Returns + ------- + str + A string that represents the "limit" part of a query. 
+ """ + if not limit: + return '' + + return "LIMIT %s" % limit diff --git a/bigquery/version.py b/bigquery/version.py index 0e1a38d..e8b6b09 100644 --- a/bigquery/version.py +++ b/bigquery/version.py @@ -1 +1 @@ -__version__ = '1.7.0' +__version__ = '1.8.1' From a9510cf433d937b4bc7159dcd28870560b1af134 Mon Sep 17 00:00:00 2001 From: Jordan Howlett Date: Tue, 24 May 2016 17:15:49 -0400 Subject: [PATCH 18/78] Add support for UseLegacySQL boolean in order to use BigQuerys standard SQL --- bigquery/client.py | 18 ++++++++++++++++-- bigquery/tests/test_client.py | 25 +++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/bigquery/client.py b/bigquery/client.py index 390ff68..5848b7e 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -262,7 +262,7 @@ def _insert_job(self, body_object): body=body_object ).execute() - def query(self, query, max_results=None, timeout=0, dry_run=False): + def query(self, query, max_results=None, timeout=0, dry_run=False, use_legacy_sql=None): """Submit a query to BigQuery. Parameters @@ -278,6 +278,9 @@ def query(self, query, max_results=None, timeout=0, dry_run=False): If True, the query isn't actually run. A valid query will return an empty response, while an invalid one will return the same error message it would if it wasn't a dry run. + use_legacy_sql : bool, optional. Default True. + If False, the query will use BigQuery's standard SQL (https://cloud.google.com/bigquery/sql-reference/) + Returns ------- @@ -298,8 +301,12 @@ def query(self, query, max_results=None, timeout=0, dry_run=False): 'query': query, 'timeoutMs': timeout * 1000, 'dryRun': dry_run, - 'maxResults': max_results, + 'maxResults': max_results } + + if use_legacy_sql is not None: + query_data['useLegacySql'] = use_legacy_sql + return self._submit_query_job(query_data) def get_query_schema(self, job_id): @@ -1027,6 +1034,7 @@ def write_to_table( priority=None, create_disposition=None, write_disposition=None, + use_legacy_sql=None ): """ Write query result to table. If dataset or table is not provided, @@ -1055,6 +1063,9 @@ def write_to_table( One of the JOB_CREATE_* constants write_disposition : str, optional One of the JOB_WRITE_* constants + use_legacy_sql: + If False, the query will use BigQuery's standard SQL (https://cloud.google.com/bigquery/sql-reference/) + Returns ------- @@ -1084,6 +1095,9 @@ def write_to_table( if use_query_cache is not None: configuration['useQueryCache'] = use_query_cache + if use_legacy_sql is not None: + configuration['useLegacySql'] = use_legacy_sql + if priority: configuration['priority'] = priority diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py index 263e2d7..bd7d4d6 100644 --- a/bigquery/tests/test_client.py +++ b/bigquery/tests/test_client.py @@ -253,6 +253,7 @@ def test_query(self): self.assertEquals(job_id, 'spiderman') self.assertEquals(results, []) + def test_query_max_results_set(self): """Ensure that we retrieve the job id from the query and the maxResults parameter is set. 
@@ -418,6 +419,30 @@ def test_query_with_results(self): self.assertEquals(job_id, 'spiderman') self.assertEquals(results, [{'foo': 10}]) + def test_query_with_using_legacy_sql(self): + """Ensure that use_legacy_sql bool gets used""" + + mock_query_job = mock.Mock() + expected_job_id = 'spiderman' + expected_job_ref = {'jobId': expected_job_id} + + mock_query_job.execute.return_value = { + 'jobReference': expected_job_ref, + 'jobComplete': True + } + + self.mock_job_collection.query.return_value = mock_query_job + + job_id, results = self.client.query(self.query, use_legacy_sql=False) + + self.mock_job_collection.query.assert_called_once_with( + projectId=self.project_id, + body={'query': self.query, 'timeoutMs': 0, 'dryRun': False, + 'maxResults': None, 'useLegacySql': False} + ) + self.assertEquals(job_id, 'spiderman') + self.assertEquals(results, []) + class TestGetQueryResults(unittest.TestCase): From cf6be16ca76358e7ebf075eee4557e4605cd8cbb Mon Sep 17 00:00:00 2001 From: Ruxandra Burtica Date: Fri, 8 Jul 2016 17:17:20 +0300 Subject: [PATCH 19/78] Getting back to the BigQuery-Python version --- bigquery/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigquery/version.py b/bigquery/version.py index e8b6b09..b280975 100644 --- a/bigquery/version.py +++ b/bigquery/version.py @@ -1 +1 @@ -__version__ = '1.8.1' +__version__ = '1.8.0' From 8ece369d380a175f99f3e8728f7b82df6699a33e Mon Sep 17 00:00:00 2001 From: Ruxandra Burtica Date: Sun, 10 Jul 2016 10:17:05 +0300 Subject: [PATCH 20/78] Fixed tests. #104 --- bigquery/tests/test_query_builder.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/bigquery/tests/test_query_builder.py b/bigquery/tests/test_query_builder.py index df37a3e..8b77603 100644 --- a/bigquery/tests/test_query_builder.py +++ b/bigquery/tests/test_query_builder.py @@ -399,7 +399,7 @@ def test_full_query(self): " WHERE (start_time <= INTEGER('1371566954')) AND " "(start_time >= INTEGER('1371556954')) GROUP BY " "timestamp, status HAVING (status == INTEGER('1')) " - "ORDER BY timestamp desc") + "ORDER BY timestamp desc ") expected_select = (expected_query[len('SELECT '):] .split('FROM')[0].strip().split(', ')) expected_from = expected_query[len('SELECT '):].split('FROM')[1] @@ -427,7 +427,7 @@ def test_empty_conditions(self): expected_query = ("SELECT status as status, start_time as timestamp, " "resource as url FROM " "[dataset.2013_06_appspot_1] ORDER BY " - "timestamp desc") + "timestamp desc ") expected_select = (expected_query[len('SELECT '):] .split('FROM')[0].strip().split(', ')) expected_from = expected_query[len('SELECT '):].split('FROM')[1] @@ -464,7 +464,7 @@ def test_incorrect_conditions(self): expected_query = ("SELECT status as status, start_time as timestamp, " "resource as url FROM " "[dataset.2013_06_appspot_1] ORDER BY " - "timestamp desc") + "timestamp desc ") expected_select = (expected_query[len('SELECT '):] .split('FROM')[0].strip().split(', ')) expected_from = expected_query[len('SELECT '):].split('FROM')[1] @@ -516,7 +516,7 @@ def test_multiple_condition_values(self): "INTEGER('1371556954')) AND " "((resource CONTAINS STRING('foo') AND resource " "CONTAINS STRING('baz')) AND (NOT resource CONTAINS " - "STRING('bar'))) ORDER BY timestamp desc") + "STRING('bar'))) ORDER BY timestamp desc ") expected_select = (expected_query[len('SELECT '):] .split('FROM')[0].strip().split(', ')) expected_from = expected_query[len('SELECT '):].split('FROM')[1] @@ -550,7 +550,7 @@ def 
test_negated_condition_value(self): expected_query = ("SELECT status as status, start_time as timestamp, " "resource as url FROM " "[dataset.2013_06_appspot_1] WHERE (NOT resource " - "CONTAINS STRING('foo')) ORDER BY timestamp desc") + "CONTAINS STRING('foo')) ORDER BY timestamp desc ") expected_select = (expected_query[len('SELECT '):] .split('FROM')[0].strip().split(', ')) expected_from = expected_query[len('SELECT '):].split('FROM')[1] @@ -593,7 +593,7 @@ def test_multiple_negated_condition_values(self): "[dataset.2013_06_appspot_1] WHERE (NOT resource " "CONTAINS STRING('foo') AND NOT resource CONTAINS " "STRING('baz') AND NOT resource CONTAINS " - "STRING('bar')) ORDER BY timestamp desc") + "STRING('bar')) ORDER BY timestamp desc ") expected_select = (expected_query[len('SELECT '):] .split('FROM')[0].strip().split(', ')) expected_from = expected_query[len('SELECT '):].split('FROM')[1] @@ -631,7 +631,7 @@ def test_empty_order(self): "resource as url FROM " "[dataset.2013_06_appspot_1] WHERE (start_time " "<= INTEGER('1371566954')) AND (start_time >= " - "INTEGER('1371556954')) ") + "INTEGER('1371556954')) ") expected_select = (expected_query[len('SELECT '):] .split('FROM')[0].strip().split(', ')) expected_from = expected_query[len('SELECT '):].split('FROM')[1] @@ -669,7 +669,7 @@ def test_incorrect_order(self): "resource as url FROM " "[dataset.2013_06_appspot_1] WHERE (start_time " "<= INTEGER('1371566954')) AND (start_time >= " - "INTEGER('1371556954')) ") + "INTEGER('1371556954')) ") expected_select = (expected_query[len('SELECT '):] .split('FROM')[0].strip().split(', ')) expected_from = expected_query[len('SELECT '):].split('FROM')[1] @@ -702,7 +702,7 @@ def test_empty_select(self): expected_query = ("SELECT * FROM [dataset.2013_06_appspot_1] " "WHERE (start_time <= INTEGER('1371566954')) AND " "(start_time >= INTEGER('1371556954')) ORDER BY " - "timestamp desc") + "timestamp desc ") self.assertEqual(result, expected_query) def test_no_alias(self): @@ -777,7 +777,7 @@ def test_formatting(self): "resource as url FROM " "[dataset.2013_06_appspot_1] WHERE (start_time " "<= INTEGER('1371566954')) AND (start_time >= " - "INTEGER('1371556954')) ORDER BY timestamp desc") + "INTEGER('1371556954')) ORDER BY timestamp desc ") expected_select = (expected_query[len('SELECT '):] .split('FROM')[0].strip().split(', ')) expected_from = expected_query[len('SELECT '):].split('FROM')[1] @@ -830,7 +830,7 @@ def test_formatting_duplicate_columns(self): "[dataset.2013_06_appspot_1] WHERE " "(start_time <= INTEGER('1371566954')) AND " "(start_time >= INTEGER('1371556954')) ORDER BY " - "timestamp desc") + "timestamp desc ") expected_select = (expected_query[len('SELECT '):] .split('FROM')[0].strip().split(', ')) expected_from = expected_query[len('SELECT '):].split('FROM')[1] @@ -874,7 +874,7 @@ def test_sec_to_micro_formatting(self): "timestamp, resource as url FROM " "[dataset.2013_06_appspot_1] WHERE (start_time " "<= INTEGER('1371566954')) AND (start_time >= " - "INTEGER('1371556954')) ORDER BY timestamp desc") + "INTEGER('1371556954')) ORDER BY timestamp desc ") expected_select = (expected_query[len('SELECT '):] .split('FROM')[0].strip().split(', ')) expected_from = expected_query[len('SELECT '):].split('FROM')[1] @@ -930,7 +930,7 @@ def test_empty_groupings(self): expected_query = ("SELECT status as status, start_time as timestamp, " "resource as url FROM " "[dataset.2013_06_appspot_1] ORDER BY " - "timestamp desc") + "timestamp desc ") expected_select = (expected_query[len('SELECT '):] 
.split('FROM')[0].strip().split(', ')) expected_from = expected_query[len('SELECT '):].split('FROM')[1] @@ -971,7 +971,7 @@ def test_multi_tables(self): "[dataset.2013_07_appspot_1] WHERE (start_time " "<= INTEGER('1371566954')) AND (start_time >= " "INTEGER('1371556954')) GROUP BY timestamp, status " - "ORDER BY timestamp desc") + "ORDER BY timestamp desc ") expected_select = (expected_query[len('SELECT '):] .split('FROM')[0].strip().split(', ')) expected_from = expected_query[len('SELECT '):].split('FROM')[1] From e1a08ab286da7bb4827efc516e9e173770726a7b Mon Sep 17 00:00:00 2001 From: Ruxandra Burtica Date: Sun, 10 Jul 2016 10:28:16 +0300 Subject: [PATCH 21/78] Added limit tests #104 --- bigquery/tests/test_query_builder.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/bigquery/tests/test_query_builder.py b/bigquery/tests/test_query_builder.py index 8b77603..d9381c5 100644 --- a/bigquery/tests/test_query_builder.py +++ b/bigquery/tests/test_query_builder.py @@ -340,6 +340,27 @@ def test_no_fields(self): self.assertEqual(result, "") +class TestLimit(unittest.TestCase): + + def test_with_limit(self): + """Ensure that render limit works.""" + from bigquery.query_builder \ + import _render_limit + + result = _render_limit(8) + + self.assertEqual(result, "LIMIT 8") + + def test_no_fields(self): + """Ensure that render limit can work without any arguments.""" + from bigquery.query_builder \ + import _render_limit + + result = _render_limit(None) + + self.assertEqual(result, "") + + class TestRenderQuery(unittest.TestCase): def test_full_query(self): @@ -392,14 +413,16 @@ def test_full_query(self): 'type': 'INTEGER' } ], - order_by={'fields': ['timestamp'], 'direction': 'desc'}) + order_by={'fields': ['timestamp'], 'direction': 'desc'}, + limit=10) expected_query = ("SELECT status as status, start_time as timestamp, " "resource as url FROM [dataset.2013_06_appspot_1]" " WHERE (start_time <= INTEGER('1371566954')) AND " "(start_time >= INTEGER('1371556954')) GROUP BY " "timestamp, status HAVING (status == INTEGER('1')) " - "ORDER BY timestamp desc ") + "ORDER BY timestamp desc " + "LIMIT 10") expected_select = (expected_query[len('SELECT '):] .split('FROM')[0].strip().split(', ')) expected_from = expected_query[len('SELECT '):].split('FROM')[1] From 5cbb968e8cfa521aea312d1a8ea8114dbdfc5013 Mon Sep 17 00:00:00 2001 From: Ruxandra Burtica Date: Sun, 10 Jul 2016 10:48:48 +0300 Subject: [PATCH 22/78] Added limit to test_full_query. 
#104 --- bigquery/tests/test_query_builder.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bigquery/tests/test_query_builder.py b/bigquery/tests/test_query_builder.py index d9381c5..6e9e9ee 100644 --- a/bigquery/tests/test_query_builder.py +++ b/bigquery/tests/test_query_builder.py @@ -421,8 +421,7 @@ def test_full_query(self): " WHERE (start_time <= INTEGER('1371566954')) AND " "(start_time >= INTEGER('1371556954')) GROUP BY " "timestamp, status HAVING (status == INTEGER('1')) " - "ORDER BY timestamp desc " - "LIMIT 10") + "ORDER BY timestamp desc LIMIT 10") expected_select = (expected_query[len('SELECT '):] .split('FROM')[0].strip().split(', ')) expected_from = expected_query[len('SELECT '):].split('FROM')[1] @@ -755,7 +754,7 @@ def test_no_alias(self): expected_query = ("SELECT status , start_time , resource FROM " "[dataset.2013_06_appspot_1] WHERE (start_time " "<= INTEGER('1371566954')) AND (start_time >= " - "INTEGER('1371556954')) ORDER BY start_time desc") + "INTEGER('1371556954')) ORDER BY start_time desc ") expected_select = (field.strip() for field in expected_query[len('SELECT '):] .split('FROM')[0].strip().split(', ')) @@ -931,7 +930,8 @@ def test_no_table_or_dataset(self): 'negate': False}], 'type': 'INTEGER'}, ], - order_by={'fields': ['timestamp'], 'direction': 'desc'}) + order_by={'fields': ['timestamp'], 'direction': 'desc'}, + limit=10) self.assertIsNone(result) From 73ded9d5b77843685498c60050ad90aa23d14612 Mon Sep 17 00:00:00 2001 From: Ruxandra Burtica Date: Sun, 10 Jul 2016 11:46:34 +0300 Subject: [PATCH 23/78] Added method for getting project IDs. --- bigquery/client.py | 7 +++++++ bigquery/tests/test_client.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/bigquery/client.py b/bigquery/client.py index 390ff68..ca210fc 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -148,6 +148,13 @@ def get_client(project_id=None, credentials=None, return BigQueryClient(bq_service, project_id, swallow_results) +def get_project_ids(bq_service): + """Given the BigQuery service, return all project IDs.""" + projects_request = bq_service.projects().list().execute() + return [project['id'] + for project in projects_request.get('projects', [])] + + def _get_bq_service(credentials=None, service_url=None): """Construct an authorized BigQuery service object.""" diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py index 263e2d7..ec8a2f4 100644 --- a/bigquery/tests/test_client.py +++ b/bigquery/tests/test_client.py @@ -214,6 +214,35 @@ def test_initialize_json_key_file_without_project_id(self, mock_open, mock_build self.assertEquals(json_key['project_id'], bq_client.project_id) +class TestGetProjectIds(unittest.TestCase): + + def test_get_project_ids(self): + mock_bq_service = mock.Mock() + mock_bq_service.projects().list().execute.return_value = { + 'kind': 'bigquery#projectList', + 'projects': [ + { + 'friendlyName': 'Big Query Test', + 'id': 'big-query-test', + 'kind': 'bigquery#project', + 'numericId': '1435372465', + 'projectReference': {'projectId': 'big-query-test'} + }, + { + 'friendlyName': 'Company', + 'id': 'company', + 'kind': 'bigquery#project', + 'numericId': '4263574685796', + 'projectReference': {'projectId': 'company'} + } + ], + 'totalItems': 2 + } + + project_ids = client.get_project_ids(mock_bq_service) + self.assertEqual(project_ids, ['big-query-test', 'company']) + + class TestQuery(unittest.TestCase): def setUp(self): From 82eb0b887a13b03180b1c24f91b9c78ee0248c8c Mon Sep 17 
00:00:00 2001
From: Ruxandra Burtica
Date: Sun, 10 Jul 2016 14:58:40 +0300
Subject: [PATCH 24/78] Updated BigQuery to return more information about projects.

---
 bigquery/client.py            | 15 +++++++++++----
 bigquery/tests/test_client.py | 14 +++++++++-----
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/bigquery/client.py b/bigquery/client.py
index ca210fc..9139ce9 100644
--- a/bigquery/client.py
+++ b/bigquery/client.py
@@ -148,11 +148,18 @@ def get_client(project_id=None, credentials=None,
     return BigQueryClient(bq_service, project_id, swallow_results)


-def get_project_ids(bq_service):
-    """Given the BigQuery service, return all project IDs."""
+def get_projects(bq_service):
+    """Given the BigQuery service, return data about all projects."""
     projects_request = bq_service.projects().list().execute()
-    return [project['id']
-            for project in projects_request.get('projects', [])]
+
+    projects = []
+    for project in projects_request.get('projects', []):
+        project_data = {
+            'id': project['id'],
+            'name': project['friendlyName']
+        }
+        projects.append(project_data)
+    return projects


 def _get_bq_service(credentials=None, service_url=None):

diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py
index ec8a2f4..988eeb4 100644
--- a/bigquery/tests/test_client.py
+++ b/bigquery/tests/test_client.py
@@ -229,18 +229,22 @@ def test_get_project_ids(self):
                    'projectReference': {'projectId': 'big-query-test'}
                },
                {
-                   'friendlyName': 'Company',
-                   'id': 'company',
+                   'friendlyName': 'BQ Company project',
+                   'id': 'bq-project',
                    'kind': 'bigquery#project',
                    'numericId': '4263574685796',
-                   'projectReference': {'projectId': 'company'}
+                   'projectReference': {'projectId': 'bq-project'}
                }
            ],
            'totalItems': 2
        }

-       project_ids = client.get_project_ids(mock_bq_service)
-       self.assertEqual(project_ids, ['big-query-test', 'company'])
+       projects = client.get_projects(mock_bq_service)
+       expected_projects_data = [
+           {'id': 'big-query-test', 'name': 'Big Query Test'},
+           {'id': 'bq-project', 'name': 'BQ Company project'}
+       ]
+       self.assertEqual(projects, expected_projects_data)


 class TestQuery(unittest.TestCase):

From 8a030d8f5d21784c2ae854516ba0d321e0324be0 Mon Sep 17 00:00:00 2001
From: CK
Date: Mon, 1 Aug 2016 11:27:15 +0530
Subject: [PATCH 25/78] allowing write_to_table to accept maximumBillingTier parameter

---
 bigquery/client.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/bigquery/client.py b/bigquery/client.py
index f4b5d6a..e6000d8 100644
--- a/bigquery/client.py
+++ b/bigquery/client.py
@@ -1048,7 +1048,8 @@ def write_to_table(
            priority=None,
            create_disposition=None,
            write_disposition=None,
-           use_legacy_sql=None
+           use_legacy_sql=None,
+           maximum_billing_tier=None
    ):
        """
        Write query result to table. If dataset or table is not provided,
@@ -1077,9 +1078,10 @@ def write_to_table(
            One of the JOB_CREATE_* constants
        write_disposition : str, optional
            One of the JOB_WRITE_* constants
-       use_legacy_sql:
+       use_legacy_sql: bool, optional
            If False, the query will use BigQuery's standard SQL (https://cloud.google.com/bigquery/sql-reference/)
-
+       maximum_billing_tier : integer, optional
+           Limits the billing tier for this job. Queries that have resource usage beyond this tier will fail (without incurring a charge). If unspecified, this will be set to your project default.
For more information, see https://cloud.google.com/bigquery/pricing#high-compute Returns ------- @@ -1106,6 +1108,9 @@ def write_to_table( if allow_large_results is not None: configuration['allowLargeResults'] = allow_large_results + if maximum_billing_tier is not None: + configuration['maximumBillingTier'] = maximum_billing_tier + if use_query_cache is not None: configuration['useQueryCache'] = use_query_cache From d1d850d68af3fbdeea661ff6ef34309c555e5801 Mon Sep 17 00:00:00 2001 From: CK Date: Wed, 3 Aug 2016 11:48:01 +0530 Subject: [PATCH 26/78] adding unittest and following pep8 standards --- bigquery/client.py | 9 ++++++-- bigquery/tests/test_client.py | 39 +++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/bigquery/client.py b/bigquery/client.py index e6000d8..d626117 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -1079,9 +1079,14 @@ def write_to_table( write_disposition : str, optional One of the JOB_WRITE_* constants use_legacy_sql: bool, optional - If False, the query will use BigQuery's standard SQL (https://cloud.google.com/bigquery/sql-reference/) + If False, the query will use BigQuery's standard SQL + (https://cloud.google.com/bigquery/sql-reference/) maximum_billing_tier : integer, optional - Limits the billing tier for this job. Queries that have resource usage beyond this tier will fail (without incurring a charge). If unspecified, this will be set to your project default. For more information, see https://cloud.google.com/bigquery/pricing#high-compute + Limits the billing tier for this job. Queries that have resource + usage beyond this tier will fail (without incurring a charge). If + unspecified, this will be set to your project default. For more + information, + see https://cloud.google.com/bigquery/pricing#high-compute Returns ------- diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py index 13695a8..e462c6b 100644 --- a/bigquery/tests/test_client.py +++ b/bigquery/tests/test_client.py @@ -1117,6 +1117,7 @@ def setUp(self): self.project_id = 'project' self.dataset_id = 'dataset' self.table_id = 'table' + self.maximum_billing_tier = 1000 self.external_udf_uris = ['gs://bucket/external_udf.js'] self.use_query_cache = False self.priority = "INTERACTIVE" @@ -1162,6 +1163,44 @@ def test_write(self): self.assertEqual(result, expected_result) + def test_write_maxbilltier(self): + """ Ensure that write is working when maximumBillingTier is set""" + expected_result = { + 'status': {'state': u'RUNNING'}, + } + + body = { + "configuration": { + "query": { + "destinationTable": { + "projectId": self.project_id, + "datasetId": self.dataset_id, + "tableId": self.table_id + }, + "query": self.query, + "userDefinedFunctionResources": [{ + "resourceUri": self.external_udf_uris[0] + }], + "useQueryCache": self.use_query_cache, + "priority": self.priority, + "maximumBillingTier": self.maximum_billing_tier + } + } + } + + self.mock_api.jobs().insert().execute.return_value = expected_result + result = self.client.write_to_table( + self.query, self.dataset_id, self.table_id, priority=self.priority, + external_udf_uris=self.external_udf_uris, use_query_cache=False, + maximum_billing_tier=self.maximum_billing_tier) + + self.mock_api.jobs().insert.assert_called_with( + projectId=self.project_id, + body=body + ) + + self.assertEqual(result, expected_result) + def test_write_http_error(self): """ Test write with http error""" expected_result = { From 8b2f6886e9ff38daf8027f8edb2e969f9c3f95f9 Mon Sep 17 00:00:00 
2001 From: Aaron Kavlie Date: Wed, 3 Aug 2016 16:09:07 -0700 Subject: [PATCH 27/78] Add flatten boolean option This allows for turning off flattening of query results for write_to_table, so an exact copy of (all or part of) a table can be created. --- README.md | 5 ----- bigquery/client.py | 7 +++++++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 7cf342a..2053e05 100644 --- a/README.md +++ b/README.md @@ -294,11 +294,6 @@ from bigquery import schema_from_record schema_from_record({"id":123, "posts": [{"id":123, "text": "tihs is a post"}], "username": "bob"}) ``` -# Caveats - -BigQuery [flattens](https://developers.google.com/bigquery/docs/data?hl=ja#flatten) results with repeated records, so a result might actually map to multiple rows. This means that the row count may be larger than the actual number of results because BigQuery reports the number of unrolled rows but the returned results are rolled back up. - - # Contributing Requirements to commit here: diff --git a/bigquery/client.py b/bigquery/client.py index d626117..64a93c2 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -1044,6 +1044,7 @@ def write_to_table( table=None, external_udf_uris=[], allow_large_results=None, + flatten=None, use_query_cache=None, priority=None, create_disposition=None, @@ -1070,6 +1071,9 @@ def write_to_table( Storage and have .js extensions. allow_large_results : bool, optional Whether or not to allow large results + flatten : bool, optional + Whether or not to flatten nested and repeated fields + in query results use_query_cache : bool, optional Whether or not to use query cache priority : str, optional @@ -1113,6 +1117,9 @@ def write_to_table( if allow_large_results is not None: configuration['allowLargeResults'] = allow_large_results + if flatten is not None: + configuration['flattenResults'] = flatten + if maximum_billing_tier is not None: configuration['maximumBillingTier'] = maximum_billing_tier From 12a2a9d6734d1b17297f84d1e1ef468025036f04 Mon Sep 17 00:00:00 2001 From: Aaron Kavlie Date: Wed, 3 Aug 2016 16:19:04 -0700 Subject: [PATCH 28/78] Move new arg to the end to avoid potential API breakage --- bigquery/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigquery/client.py b/bigquery/client.py index 64a93c2..2f879a7 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -1044,13 +1044,13 @@ def write_to_table( table=None, external_udf_uris=[], allow_large_results=None, - flatten=None, use_query_cache=None, priority=None, create_disposition=None, write_disposition=None, use_legacy_sql=None, - maximum_billing_tier=None + maximum_billing_tier=None, + flatten=None ): """ Write query result to table. If dataset or table is not provided, From 6bde6044497e761c3aa313a50b6439ab2f6245a6 Mon Sep 17 00:00:00 2001 From: Aaron Kavlie Date: Wed, 3 Aug 2016 16:21:48 -0700 Subject: [PATCH 29/78] Move parameter description too --- bigquery/client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bigquery/client.py b/bigquery/client.py index 2f879a7..4321ac8 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -1071,9 +1071,6 @@ def write_to_table( Storage and have .js extensions. 
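A sketch of the table-copy use case described in the commit message above (dataset and table names are hypothetical):

from bigquery import get_client

client = get_client('my-project', json_key_file='key.json', readonly=False)

# Copy (part of) a table exactly: large results need a named destination
# table, and flatten=False preserves nested/repeated records as-is.
job = client.write_to_table(
    'SELECT * FROM [my_dataset.source_table]',
    dataset='my_dataset',
    table='source_table_copy',
    allow_large_results=True,
    flatten=False,
    maximum_billing_tier=2)  # optional cost cap from the preceding patches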
allow_large_results : bool, optional Whether or not to allow large results - flatten : bool, optional - Whether or not to flatten nested and repeated fields - in query results use_query_cache : bool, optional Whether or not to use query cache priority : str, optional @@ -1091,6 +1088,9 @@ def write_to_table( unspecified, this will be set to your project default. For more information, see https://cloud.google.com/bigquery/pricing#high-compute + flatten : bool, optional + Whether or not to flatten nested and repeated fields + in query results Returns ------- From 28e94c91aaed9d409536e1c73f22f67f72160bb3 Mon Sep 17 00:00:00 2001 From: Aaron Kavlie Date: Wed, 3 Aug 2016 22:18:49 -0700 Subject: [PATCH 30/78] Add test of flatten option --- bigquery/tests/test_client.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py index e462c6b..4ffac9b 100644 --- a/bigquery/tests/test_client.py +++ b/bigquery/tests/test_client.py @@ -1121,6 +1121,7 @@ def setUp(self): self.external_udf_uris = ['gs://bucket/external_udf.js'] self.use_query_cache = False self.priority = "INTERACTIVE" + self.flatten_results = False self.client = client.BigQueryClient(self.mock_api, self.project_id) @@ -1144,6 +1145,7 @@ def test_write(self): }], "useQueryCache": self.use_query_cache, "priority": self.priority, + "flattenResults": self.flatten_results, } } } @@ -1154,6 +1156,7 @@ def test_write(self): self.table_id, external_udf_uris=self.external_udf_uris, use_query_cache=False, + flatten=False, priority=self.priority) self.mock_api.jobs().insert.assert_called_with( From a48ce366ba3ac7ba9a924164bb322d4a88e587ce Mon Sep 17 00:00:00 2001 From: Tyler Treat Date: Mon, 12 Sep 2016 12:17:16 -0500 Subject: [PATCH 31/78] Bump version to 1.9.0 --- bigquery/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigquery/version.py b/bigquery/version.py index b280975..e5102d3 100644 --- a/bigquery/version.py +++ b/bigquery/version.py @@ -1 +1 @@ -__version__ = '1.8.0' +__version__ = '1.9.0' From 651200e7731a99c8420a6c470a32e326c3e01832 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A3=95=E5=8F=B8=20=E9=87=91=E6=B2=A2?= Date: Tue, 4 Oct 2016 19:18:40 +0900 Subject: [PATCH 32/78] Add the time partitioning --- bigquery/client.py | 8 +++++++- bigquery/tests/test_client.py | 22 ++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/bigquery/client.py b/bigquery/client.py index 4321ac8..6e1fe1a 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -520,7 +520,8 @@ def get_table(self, dataset, table): return table - def create_table(self, dataset, table, schema, expiration_time=None): + def create_table(self, dataset, table, schema, + expiration_time=None, time_partitioning=False): """Create a new table in the dataset. Parameters @@ -533,6 +534,8 @@ def create_table(self, dataset, table, schema, expiration_time=None): The table schema expiration_time : float, optional The expiry time in milliseconds since the epoch. + time_partitioning : bool, optional + Create a time partitioning. 
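A sketch of creating a day-partitioned table with the new flag (the schema and names are hypothetical):

from bigquery import get_client

client = get_client('my-project', json_key_file='key.json', readonly=False)

schema = [
    {'name': 'ts', 'type': 'TIMESTAMP', 'mode': 'REQUIRED'},
    {'name': 'value', 'type': 'INTEGER', 'mode': 'NULLABLE'},
]

# With the follow-up fix below, this sends
# {'timePartitioning': {'type': 'DAY'}} in the table resource body.
client.create_table('my_dataset', 'events', schema, time_partitioning=True)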
Returns ------- @@ -553,6 +556,9 @@ def create_table(self, dataset, table, schema, expiration_time=None): if expiration_time is not None: body['expirationTime'] = expiration_time + if time_partitioning: + body['timePartitioning'] = "DAY" + try: table = self.bigquery.tables().insert( projectId=self.project_id, diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py index 4ffac9b..69b923c 100644 --- a/bigquery/tests/test_client.py +++ b/bigquery/tests/test_client.py @@ -1675,6 +1675,7 @@ def setUp(self): 'datasetId': self.dataset} } self.expiration_time = 1437513693000 + self.time_partitioning = True def test_table_create_failed(self): """Ensure that if creating the table fails, False is returned, @@ -1748,6 +1749,27 @@ def test_table_create_body_with_expiration_time(self): self.mock_tables.insert.return_value.execute.assert_called_with() + def test_table_create_body_with_time_partitioning(self): + """Ensure that if time_partitioning has specified, + it passed to the body.""" + + self.mock_tables.insert.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 'bar'}] + + self.client.create_table(self.dataset, self.table, + self.schema, + time_partitioning=self.time_partitioning) + + body = self.body.copy() + body.update({ + 'timePartitioning': "DAY" + }) + + self.mock_tables.insert.assert_called_with( + projectId=self.project, datasetId=self.dataset, body=body) + + self.mock_tables.insert.return_value.execute.assert_called_with() + class TestUpdateTable(unittest.TestCase): From 35e979001269549d8521d1497c120d4671b4364b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A3=95=E5=8F=B8=20=E9=87=91=E6=B2=A2?= Date: Wed, 5 Oct 2016 15:10:45 +0900 Subject: [PATCH 33/78] fixed time partitioning issue --- bigquery/client.py | 2 +- bigquery/tests/test_client.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bigquery/client.py b/bigquery/client.py index 6e1fe1a..0fdc98c 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -557,7 +557,7 @@ def create_table(self, dataset, table, schema, body['expirationTime'] = expiration_time if time_partitioning: - body['timePartitioning'] = "DAY" + body['timePartitioning'] = {'type': 'DAY'} try: table = self.bigquery.tables().insert( diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py index 69b923c..b740414 100644 --- a/bigquery/tests/test_client.py +++ b/bigquery/tests/test_client.py @@ -1762,7 +1762,7 @@ def test_table_create_body_with_time_partitioning(self): body = self.body.copy() body.update({ - 'timePartitioning': "DAY" + 'timePartitioning': {'type': 'DAY'} }) self.mock_tables.insert.assert_called_with( From ee3e54f8897905b7ff2160ce88b6609b2602c5db Mon Sep 17 00:00:00 2001 From: nrfk Date: Mon, 17 Oct 2016 20:25:08 +0200 Subject: [PATCH 34/78] Add possibility to decide location (EU or US) of a dataset when creating a new dataset --- bigquery/client.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/bigquery/client.py b/bigquery/client.py index 0fdc98c..aa589d4 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -1655,7 +1655,7 @@ def _raise_executing_exception_if_error(self, job): # DataSet manipulation methods # def create_dataset(self, dataset_id, friendly_name=None, description=None, - access=None): + access=None, location=None): """Create a new BigQuery dataset. 
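A sketch of pinning a new dataset's storage region via the location argument (names are hypothetical):

from bigquery import get_client

client = get_client('my-project', json_key_file='key.json', readonly=False)

# 'EU' keeps the dataset's data in Europe; 'US' is the other option.
client.create_dataset('my_dataset',
                      friendly_name='My dataset',
                      description='Example dataset stored in the EU',
                      location='EU')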
Parameters @@ -1670,6 +1670,9 @@ def create_dataset(self, dataset_id, friendly_name=None, description=None, access : list, optional Indicating access permissions (see https://developers.google.com/bigquery/docs/reference/v2/datasets#resource) + location : str, optional + Indicating where dataset should be stored: EU or US (see + https://developers.google.com/bigquery/docs/reference/v2/datasets#resource) Returns ------- @@ -1682,7 +1685,8 @@ def create_dataset(self, dataset_id, friendly_name=None, description=None, dataset_data = self.dataset_resource(dataset_id, friendly_name=friendly_name, description=description, - access=access) + access=access, + location=location) response = datasets.insert(projectId=self.project_id, body=dataset_data).execute() @@ -1843,7 +1847,7 @@ def patch_dataset(self, dataset_id, friendly_name=None, description=None, return {} def dataset_resource(self, ref_id, friendly_name=None, description=None, - access=None): + access=None, location=None): """See https://developers.google.com/bigquery/docs/reference/v2/datasets#resource @@ -1857,6 +1861,8 @@ def dataset_resource(self, ref_id, friendly_name=None, description=None, An optional description for the dataset access : list, optional Indicating access permissions + location: str, optional, 'EU' or 'US' + An optional geographical location for the dataset(EU or US) Returns ------- @@ -1875,6 +1881,8 @@ def dataset_resource(self, ref_id, friendly_name=None, description=None, data["description"] = description if access: data["access"] = access + if location: + data["location"] = location return data From a9e37c839bc23899550840a65355ec84eb446ce7 Mon Sep 17 00:00:00 2001 From: Robin Thomas Date: Wed, 26 Oct 2016 12:07:52 -0400 Subject: [PATCH 35/78] add external_udf_uris support to query(), with docstring and test coverage. Change write_to_table code to use the same list comprehension to construct userDefinedFunctionResources. --- bigquery/client.py | 23 +++++++++++++---------- bigquery/tests/test_client.py | 12 +++++++++--- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/bigquery/client.py b/bigquery/client.py index aa589d4..927345c 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -276,7 +276,7 @@ def _insert_job(self, body_object): body=body_object ).execute() - def query(self, query, max_results=None, timeout=0, dry_run=False, use_legacy_sql=None): + def query(self, query, max_results=None, timeout=0, dry_run=False, use_legacy_sql=None, external_udf_uris=None): """Submit a query to BigQuery. Parameters @@ -294,6 +294,9 @@ def query(self, query, max_results=None, timeout=0, dry_run=False, use_legacy_sq message it would if it wasn't a dry run. use_legacy_sql : bool, optional. Default True. If False, the query will use BigQuery's standard SQL (https://cloud.google.com/bigquery/sql-reference/) + external_udf_uris : list, optional + Contains external UDF URIs. If given, URIs must be Google Cloud + Storage and have .js extensions. 
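A sketch of referencing an external UDF from query(); the bucket and the urlDecode function are hypothetical, and in legacy SQL a UDF is a table-valued function applied to a subselect:

from bigquery import get_client

client = get_client('my-project', json_key_file='key.json')

# 'urlDecode' must be defined and registered in the referenced .js file.
job_id, results = client.query(
    'SELECT requests FROM urlDecode((SELECT requests FROM [my_dataset.logs]))',
    external_udf_uris=['gs://my-bucket/url_decode.js'])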
Returns @@ -321,6 +324,10 @@ def query(self, query, max_results=None, timeout=0, dry_run=False, use_legacy_sq if use_legacy_sql is not None: query_data['useLegacySql'] = use_legacy_sql + if external_udf_uris: + query_data['userDefinedFunctionResources'] = \ + [ {'resourceUri': u} for u in external_udf_uris ] + return self._submit_query_job(query_data) def get_query_schema(self, job_id): @@ -1048,7 +1055,7 @@ def write_to_table( query, dataset=None, table=None, - external_udf_uris=[], + external_udf_uris=None, allow_large_results=None, use_query_cache=None, priority=None, @@ -1073,7 +1080,7 @@ def write_to_table( table : str, optional String id of the table external_udf_uris : list, optional - Contains extternal UDF URIs. If given, URIs must be Google Cloud + Contains external UDF URIs. If given, URIs must be Google Cloud Storage and have .js extensions. allow_large_results : bool, optional Whether or not to allow large results @@ -1144,13 +1151,9 @@ def write_to_table( if write_disposition: configuration['writeDisposition'] = write_disposition - configuration['userDefinedFunctionResources'] = [] - for external_udf_uri in external_udf_uris: - configuration['userDefinedFunctionResources'].append( - { - "resourceUri": external_udf_uri - } - ) + if external_udf_uris: + configuration['userDefinedFunctionResources'] = \ + [ {'resourceUri': u} for u in external_udf_uris ] body = { "configuration": { diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py index b740414..94c7d61 100644 --- a/bigquery/tests/test_client.py +++ b/bigquery/tests/test_client.py @@ -259,6 +259,7 @@ def setUp(self): self.query = 'foo' self.project_id = 'project' + self.external_udf_uris = ['gs://bucket/external_udf.js'] self.client = client.BigQueryClient(self.mock_bq_service, self.project_id) @@ -276,12 +277,17 @@ def test_query(self): self.mock_job_collection.query.return_value = mock_query_job - job_id, results = self.client.query(self.query) + job_id, results = self.client.query(self.query, external_udf_uris=self.external_udf_uris) self.mock_job_collection.query.assert_called_once_with( projectId=self.project_id, - body={'query': self.query, 'timeoutMs': 0, 'dryRun': False, - 'maxResults': None} + body={ + 'query': self.query, + 'userDefinedFunctionResources': [ {'resourceUri': u} for u in self.external_udf_uris ], + 'timeoutMs': 0, + 'dryRun': False, + 'maxResults': None + } ) self.assertEquals(job_id, 'spiderman') self.assertEquals(results, []) From c0eef0a90fcc8716e426d4a95ab1a28c9bfeeb0a Mon Sep 17 00:00:00 2001 From: Tyler Treat Date: Wed, 26 Oct 2016 14:44:49 -0500 Subject: [PATCH 36/78] Bump version to 1.10.0 --- bigquery/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigquery/version.py b/bigquery/version.py index e5102d3..52af183 100644 --- a/bigquery/version.py +++ b/bigquery/version.py @@ -1 +1 @@ -__version__ = '1.9.0' +__version__ = '1.10.0' From 3559f5b74bc385b78ef329f77d144a805f18c958 Mon Sep 17 00:00:00 2001 From: Trenton Smith Date: Fri, 18 Nov 2016 16:20:04 -0700 Subject: [PATCH 37/78] add NullHandler to library logger --- bigquery/client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bigquery/client.py b/bigquery/client.py index 927345c..a25f6fa 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -1,6 +1,6 @@ import calendar import json -from logging import getLogger +from logging import getLogger, NullHandler from collections import defaultdict from datetime import datetime, timedelta from hashlib import sha256 @@ 
-47,6 +47,7 @@ JOB_DESTINATION_FORMAT_CSV = JOB_FORMAT_CSV logger = getLogger(__name__) +logger.addHandler(logging.NullHandler()) def get_client(project_id=None, credentials=None, From b5a88cb59ff40e1d252566789ef627e1d731778d Mon Sep 17 00:00:00 2001 From: Trenton Smith Date: Fri, 18 Nov 2016 16:25:59 -0700 Subject: [PATCH 38/78] fix typo and add NullHandler to logger in query_builder --- bigquery/client.py | 2 +- bigquery/query_builder.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bigquery/client.py b/bigquery/client.py index a25f6fa..1c58ed1 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -47,7 +47,7 @@ JOB_DESTINATION_FORMAT_CSV = JOB_FORMAT_CSV logger = getLogger(__name__) -logger.addHandler(logging.NullHandler()) +logger.addHandler(NullHandler()) def get_client(project_id=None, credentials=None, diff --git a/bigquery/query_builder.py b/bigquery/query_builder.py index 7362148..1054299 100644 --- a/bigquery/query_builder.py +++ b/bigquery/query_builder.py @@ -1,6 +1,7 @@ -from logging import getLogger +from logging import getLogger, NullHandler logger = getLogger(__name__) +logger.addHandler(NullHandler()) def render_query(dataset, tables, select=None, conditions=None, From d09ef1aab7ff66c491e668b1af759d0ebb20fa19 Mon Sep 17 00:00:00 2001 From: Julio David Quintana Date: Wed, 21 Dec 2016 18:18:22 -0600 Subject: [PATCH 39/78] Add ability to choose to use legacy SQL or standard SQL when creating a view. --- bigquery/client.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bigquery/client.py b/bigquery/client.py index 1c58ed1..d669673 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -680,7 +680,7 @@ def patch_table(self, dataset, table, schema): else: return {} - def create_view(self, dataset, view, query): + def create_view(self, dataset, view, query, use_legacy_sql=None): """Create a new view in the dataset. Parameters @@ -710,6 +710,9 @@ def create_view(self, dataset, view, query): } } + if use_legacy_sql is not None: + body['view']['useLegacySql'] = use_legacy_sql + try: view = self.bigquery.tables().insert( projectId=self.project_id, From acaee133eed04f4f06315c9e9e1e66c1dcf2a4ca Mon Sep 17 00:00:00 2001 From: Julio David Quintana Date: Thu, 22 Dec 2016 10:59:26 -0600 Subject: [PATCH 40/78] Add docstring for use_legacy_sql kwarg --- bigquery/client.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bigquery/client.py b/bigquery/client.py index d669673..7e5d92f 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -691,6 +691,9 @@ def create_view(self, dataset, view, query, use_legacy_sql=None): The name of the view to create query : dict A query that BigQuery executes when the view is referenced. 
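A sketch of defining a standard-SQL view with the new argument (names are hypothetical):

from bigquery import get_client

client = get_client('my-project', json_key_file='key.json', readonly=False)

# use_legacy_sql=False marks the view definition as standard SQL, so the
# defining query may use backtick-quoted, dot-separated table paths.
client.create_view(
    'my_dataset',
    'recent_events',
    'SELECT * FROM `my-project.my_dataset.events` WHERE value > 0',
    use_legacy_sql=False)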
+ use_legacy_sql : bool, optional + If False, the query will use BigQuery's standard SQL + (https://cloud.google.com/bigquery/sql-reference/) Returns ------- From b2e39e49c5da0be085ead109e3860c9153f361cd Mon Sep 17 00:00:00 2001 From: Tyler Treat Date: Tue, 14 Feb 2017 09:08:49 -0600 Subject: [PATCH 41/78] Bump version to 1.11.0 --- bigquery/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigquery/version.py b/bigquery/version.py index 52af183..da77e85 100644 --- a/bigquery/version.py +++ b/bigquery/version.py @@ -1 +1 @@ -__version__ = '1.10.0' +__version__ = '1.11.0' From bcb358e2546cdfe495e55844676f101ba4619efe Mon Sep 17 00:00:00 2001 From: Ciaran Blewitt Date: Tue, 7 Mar 2017 13:34:08 +1100 Subject: [PATCH 42/78] Add support for long in schema_builder --- bigquery/schema_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigquery/schema_builder.py b/bigquery/schema_builder.py index 575b390..f4ff8ca 100644 --- a/bigquery/schema_builder.py +++ b/bigquery/schema_builder.py @@ -126,7 +126,7 @@ def bigquery_type(o, timestamp_parser=default_timestamp_parser): """ t = type(o) - if t == int: + if t == int or t == long: return "integer" elif (t == six.binary_type and six.PY2) or t == six.text_type: if timestamp_parser and timestamp_parser(o): From df78c3512bbe21d15add1b19079a50d823e35275 Mon Sep 17 00:00:00 2001 From: Ciaran Blewitt Date: Tue, 7 Mar 2017 15:06:15 +1100 Subject: [PATCH 43/78] Replaced check with python3-friendly version --- bigquery/schema_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigquery/schema_builder.py b/bigquery/schema_builder.py index f4ff8ca..c55429a 100644 --- a/bigquery/schema_builder.py +++ b/bigquery/schema_builder.py @@ -126,7 +126,7 @@ def bigquery_type(o, timestamp_parser=default_timestamp_parser): """ t = type(o) - if t == int or t == long: + if isinstance(t, six.integertype): return "integer" elif (t == six.binary_type and six.PY2) or t == six.text_type: if timestamp_parser and timestamp_parser(o): From d098adc35d157a7aa948693817784c189e67f60e Mon Sep 17 00:00:00 2001 From: Ciaran Blewitt Date: Tue, 7 Mar 2017 15:09:08 +1100 Subject: [PATCH 44/78] Fixed typo for six.integer_types --- bigquery/schema_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigquery/schema_builder.py b/bigquery/schema_builder.py index c55429a..35369f6 100644 --- a/bigquery/schema_builder.py +++ b/bigquery/schema_builder.py @@ -126,7 +126,7 @@ def bigquery_type(o, timestamp_parser=default_timestamp_parser): """ t = type(o) - if isinstance(t, six.integertype): + if isinstance(t, six.integer_types): return "integer" elif (t == six.binary_type and six.PY2) or t == six.text_type: if timestamp_parser and timestamp_parser(o): From 9db30711ba7009ab2ee04ae195927c717cafc959 Mon Sep 17 00:00:00 2001 From: Ciaran Blewitt Date: Tue, 7 Mar 2017 16:18:06 +1100 Subject: [PATCH 45/78] Changed schema_builder to check in six.integer_types --- bigquery/schema_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigquery/schema_builder.py b/bigquery/schema_builder.py index 35369f6..65027b8 100644 --- a/bigquery/schema_builder.py +++ b/bigquery/schema_builder.py @@ -126,7 +126,7 @@ def bigquery_type(o, timestamp_parser=default_timestamp_parser): """ t = type(o) - if isinstance(t, six.integer_types): + if t in six.integer_types: return "integer" elif (t == six.binary_type and six.PY2) or t == six.text_type: if timestamp_parser and timestamp_parser(o): From 
3939514d0559a8d33f8b8cf647311922e0124b54 Mon Sep 17 00:00:00 2001 From: cynipe Date: Fri, 28 Apr 2017 13:54:20 +0900 Subject: [PATCH 46/78] Allow to specify nested column as insertId for push_row --- bigquery/client.py | 11 ++++++--- bigquery/tests/test_client.py | 46 +++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/bigquery/client.py b/bigquery/client.py index 7e5d92f..17a3a89 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -6,6 +6,7 @@ from hashlib import sha256 from io import StringIO from time import sleep, time +from functools import reduce import six from bigquery.errors import (BigQueryTimeoutException, JobExecutingException, @@ -1236,7 +1237,8 @@ def push_rows(self, dataset, table, rows, insert_id_key=None, rows : list A ``list`` of rows (``dict`` objects) to add to the table insert_id_key : str, optional - Key for insertId in row + Key for insertId in row. + You can use dot separated key for nested column. skip_invalid_rows : bool, optional Insert all valid rows of a request, even if invalid rows exist. ignore_unknown_values : bool, optional @@ -1258,8 +1260,11 @@ def push_rows(self, dataset, table, rows, insert_id_key=None, for row in rows: each_row = {} each_row["json"] = row - if insert_id_key in row: - each_row["insertId"] = row[insert_id_key] + if insert_id_key is not None: + keys = insert_id_key.split('.') + val = reduce(lambda d, key: d.get(key) if d else None, keys, row) + if val is not None: + each_row["insertId"] = val rows_data.append(each_row) data = { diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py index 94c7d61..1315147 100644 --- a/bigquery/tests/test_client.py +++ b/bigquery/tests/test_client.py @@ -2325,6 +2325,52 @@ def test_request_data_with_options(self): tableId=self.table, body=expected_body) + def test_insert_id_key_with_nested_column(self): + """Ensure that dot separated insert_id_key properly extracted with nested column value.""" + rows = [ + {'nested': {'col': 'nested_col1'}, 'val': 1}, + {'nested': {'col': 'nested_col2'}, 'val': 2}, + ] + expected_body = self.data.copy() + expected_body['rows'] = [ + {'insertId': 'nested_col1', 'json': {'nested': {'col': 'nested_col1'}, 'val': 1}}, + {'insertId': 'nested_col2', 'json': {'nested': {'col': 'nested_col2'}, 'val': 2}}, + ] + + self.client.push_rows(self.dataset, self.table, rows, + insert_id_key='nested.col') + self.mock_table_data.insertAll.assert_called_with( + projectId=self.project, + datasetId=self.dataset, + tableId=self.table, + body=expected_body) + + expected_body = self.data.copy() + expected_body['rows'] = [ + {'insertId': 1, 'json': {'nested': {'col': 'nested_col1'}, 'val': 1}}, + {'insertId': 2, 'json': {'nested': {'col': 'nested_col2'}, 'val': 2}}, + ] + self.client.push_rows(self.dataset, self.table, rows, + insert_id_key='val') + self.mock_table_data.insertAll.assert_called_with( + projectId=self.project, + datasetId=self.dataset, + tableId=self.table, + body=expected_body) + + expected_body = self.data.copy() + expected_body['rows'] = [ + {'json': {'nested': {'col': 'nested_col1'}, 'val': 1}}, + {'json': {'nested': {'col': 'nested_col2'}, 'val': 2}}, + ] + self.client.push_rows(self.dataset, self.table, rows, + insert_id_key='no_such.column') + self.mock_table_data.insertAll.assert_called_with( + projectId=self.project, + datasetId=self.dataset, + tableId=self.table, + body=expected_body) + class TestGetAllTables(unittest.TestCase): From 0edf54c0cd1de51a1993173fd55cb1f4b0834124 Mon Sep 17 00:00:00 2001 
From: Tyler Treat Date: Fri, 5 May 2017 00:24:45 -0500 Subject: [PATCH 47/78] Bump version to 1.11.1 --- bigquery/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigquery/version.py b/bigquery/version.py index da77e85..522ba08 100644 --- a/bigquery/version.py +++ b/bigquery/version.py @@ -1 +1 @@ -__version__ = '1.11.0' +__version__ = '1.11.1' From 8ccce603045a24bd0df49a2bf26a946ac84cba6e Mon Sep 17 00:00:00 2001 From: e271828- Date: Sat, 27 May 2017 09:27:37 -0700 Subject: [PATCH 48/78] _parse_table_name failed in the event of a name like _YYYYMMDD_ --- bigquery/client.py | 5 ++++- bigquery/tests/test_client.py | 11 ++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/bigquery/client.py b/bigquery/client.py index 17a3a89..b8971cd 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -1436,6 +1436,8 @@ def _parse_table_name(self, table_id): """Parse a table name in the form of appid_YYYY_MM or YYYY_MM_appid and return a tuple consisting of YYYY-MM and the app id. + Returns (None, None) in the event of a name like _YYYYMMDD_ + Parameters ---------- table_id : str @@ -1463,9 +1465,10 @@ def _parse_table_name(self, table_id): year_month = "-".join(attributes[-2:]) app_id = "-".join(attributes[:-2]) + # Check if date parsed correctly if year_month.count("-") == 1 and all( - [num.isdigit() for num in year_month.split('-')]): + [num.isdigit() for num in year_month.split('-')]) and len(year_month) == 7: return year_month, app_id return None, None diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py index 1315147..a331387 100644 --- a/bigquery/tests/test_client.py +++ b/bigquery/tests/test_client.py @@ -1333,6 +1333,15 @@ def test_not_inside_range(self): "kind": "bigquery#tableList", "etag": "\"GSclnjk0zID1ucM3F-xYinOm1oE/cn58Rpu8v8pB4eoJQaiTe11lPQc\"", "tables": [ + { + "kind": "bigquery#table", + "id": "project:dataset.notanappspottable_20130515_0261", + "tableReference": { + "projectId": "project", + "datasetId": "dataset", + "tableId": "notanappspottable_20130515_0261" + } + }, { "kind": "bigquery#table", "id": "project:dataset.2013_05_appspot_1", @@ -2389,7 +2398,7 @@ def test_get_all_tables(self): bq = client.BigQueryClient(mock_bq_service, 'project') expected_result = [ - '2013_05_appspot', '2013_06_appspot_1', '2013_06_appspot_2', + 'notanappspottable_20130515_0261', '2013_05_appspot', '2013_06_appspot_1', '2013_06_appspot_2', '2013_06_appspot_3', '2013_06_appspot_4', '2013_06_appspot_5', 'appspot_6_2013_06', 'table_not_matching_naming' ] From 2e3509fa3ff8afc5206990fb3e347eae83fccde0 Mon Sep 17 00:00:00 2001 From: Tyler Treat Date: Sun, 28 May 2017 22:20:33 -0500 Subject: [PATCH 49/78] Bump version to 1.11.2 --- bigquery/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigquery/version.py b/bigquery/version.py index 522ba08..6c371de 100644 --- a/bigquery/version.py +++ b/bigquery/version.py @@ -1 +1 @@ -__version__ = '1.11.1' +__version__ = '1.11.2' From d9e9ac7dc6e732505c13df74fed9f40473a2515d Mon Sep 17 00:00:00 2001 From: e271828- Date: Thu, 8 Jun 2017 18:50:28 -0700 Subject: [PATCH 50/78] document get_table --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2053e05..d53d9a9 100644 --- a/README.md +++ b/README.md @@ -135,7 +135,7 @@ job_id, _ = client.query(query) # Managing Tables -The BigQuery client provides facilities to manage dataset tables, including creating, deleting, and checking the existence of tables. 
+The BigQuery client provides facilities to manage dataset tables, including creating, deleting, checking the existence, and getting the metadata of tables. ```python # Create a new table. @@ -150,6 +150,10 @@ deleted = client.delete_table('dataset', 'my_table') # Check if a table exists. exists = client.check_table('dataset', 'my_table') + +# Get a table's full metadata. Includes numRows, numBytes, etc. +# See: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables +metadata = client.get_table('dataset', 'my_table') ``` There is also functionality for retrieving tables that are associated with a Google App Engine appspot, assuming table names are in the form of appid_YYYY_MM or YYYY_MM_appid. This allows tables between a date range to be selected and queried on. From 503b8a6553b7fc684e784abc9136baad0af17a80 Mon Sep 17 00:00:00 2001 From: tushar Date: Mon, 12 Jun 2017 18:56:38 +0200 Subject: [PATCH 51/78] updated update_table to include tableId= table --- bigquery/client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigquery/client.py b/bigquery/client.py index b8971cd..db264a2 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -618,6 +618,7 @@ def update_table(self, dataset, table, schema): try: result = self.bigquery.tables().update( projectId=self.project_id, + tableId= table, datasetId=dataset, body=body ).execute() From b60256e97def3f7bd68ad53f6c21de8fcdf47d89 Mon Sep 17 00:00:00 2001 From: tushar Date: Mon, 12 Jun 2017 18:57:03 +0200 Subject: [PATCH 52/78] update update_table unit test to include tableId --- bigquery/tests/test_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py index a331387..60bcc42 100644 --- a/bigquery/tests/test_client.py +++ b/bigquery/tests/test_client.py @@ -1830,7 +1830,7 @@ def test_table_update_failed(self): self.client.swallow_results = True self.mock_tables.update.assert_called_with( - projectId=self.project, datasetId=self.dataset, body=self.body) + projectId=self.project, tableId=self.table, datasetId=self.dataset, body=self.body) self.mock_tables.update.return_value.execute.assert_called_with() @@ -1856,7 +1856,7 @@ def test_table_update_success(self): self.client.swallow_results = True self.mock_tables.update.assert_called_with( - projectId=self.project, datasetId=self.dataset, body=self.body) + projectId=self.project, tableId=self.table, datasetId=self.dataset, body=self.body) self.mock_tables.update.return_value.execute.assert_called_with() From d6744c37431d1445fd6a419625d140b289a068b9 Mon Sep 17 00:00:00 2001 From: Tyler Treat Date: Tue, 13 Jun 2017 09:04:41 -0500 Subject: [PATCH 53/78] Bump version to 1.12.0 --- bigquery/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigquery/version.py b/bigquery/version.py index 6c371de..666b2f7 100644 --- a/bigquery/version.py +++ b/bigquery/version.py @@ -1 +1 @@ -__version__ = '1.11.2' +__version__ = '1.12.0' From d18356a93d5af24cc50eed6bfd7cd3154f313947 Mon Sep 17 00:00:00 2001 From: Vishvajit Pathak Date: Thu, 3 Aug 2017 17:57:56 +0530 Subject: [PATCH 54/78] typo fixes --- bigquery/query_builder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigquery/query_builder.py b/bigquery/query_builder.py index 1054299..b29d0cd 100644 --- a/bigquery/query_builder.py +++ b/bigquery/query_builder.py @@ -29,7 +29,7 @@ def render_query(dataset, tables, select=None, conditions=None, 'comparators' maps to another ``dict`` containing the keys 'condition', 
'negate', and 'value'. If 'comparators' = {'condition': '>=', 'negate': False, 'value': 1}, - this example will be rdnered as 'foo >= FLOAT('1')' in the query. + this example will be rendered as 'foo >= FLOAT('1')' in the query. ``list`` of field names to group by order_by : dict, optional Keys = {'field', 'direction'}. `dict` should be formatted as @@ -170,7 +170,7 @@ def _render_conditions(conditions): Parameters ---------- conditions : list - A list of dictionay items to filter a table. + A list of dictionary items to filter a table. Returns ------- From 66c18809061eefb646d5a08d2725d2f85059fdd7 Mon Sep 17 00:00:00 2001 From: Vishvajit Pathak Date: Thu, 3 Aug 2017 18:04:50 +0530 Subject: [PATCH 55/78] typos fix --- bigquery/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigquery/client.py b/bigquery/client.py index db264a2..61933e4 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -983,7 +983,7 @@ def export_data_to_uris( Parameters ---------- - destination_urls : Union[str, list] + destination_uris : Union[str, list] ``str`` or ``list`` of ``str`` objects representing the URIs on cloud storage of the form: gs://bucket/filename dataset : str From 55915c0f7134f4f245bb7da4a9c0425910dc3c66 Mon Sep 17 00:00:00 2001 From: Vishvajit Pathak Date: Thu, 3 Aug 2017 18:28:42 +0530 Subject: [PATCH 56/78] typo fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d53d9a9..8171078 100644 --- a/README.md +++ b/README.md @@ -295,7 +295,7 @@ exists = client.check_dataset('mydataset') ```python from bigquery import schema_from_record -schema_from_record({"id":123, "posts": [{"id":123, "text": "tihs is a post"}], "username": "bob"}) +schema_from_record({"id":123, "posts": [{"id":123, "text": "this is a post"}], "username": "bob"}) ``` # Contributing From b34eff532daaa53bb0192a1e2f258d5e47ced51f Mon Sep 17 00:00:00 2001 From: Vishvajit Pathak Date: Thu, 3 Aug 2017 19:01:17 +0530 Subject: [PATCH 57/78] expiration_time changed to int/double --- bigquery/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigquery/client.py b/bigquery/client.py index 61933e4..9af8dcb 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -541,7 +541,7 @@ def create_table(self, dataset, table, schema, The name of the table to create schema : dict The table schema - expiration_time : float, optional + expiration_time : int or double, optional The expiry time in milliseconds since the epoch. time_partitioning : bool, optional Create a time partitioning. From 67c855626a0f6e83f2724387b83c2c0440234a1f Mon Sep 17 00:00:00 2001 From: Yves Bastide Date: Mon, 31 Jul 2017 15:18:18 +0200 Subject: [PATCH 58/78] Add num_retries argument Signed-off-by: Yves Bastide --- bigquery/client.py | 56 +++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/bigquery/client.py b/bigquery/client.py index 9af8dcb..847c9fb 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -55,7 +55,8 @@ def get_client(project_id=None, credentials=None, service_url=None, service_account=None, private_key=None, private_key_file=None, json_key=None, json_key_file=None, - readonly=True, swallow_results=True): + readonly=True, swallow_results=True, + num_retries=0): """Return a singleton instance of BigQueryClient. Either AssertionCredentials or a service account and private key combination need to be provided in order to authenticate requests to BigQuery. 
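Since the comparator shape documented above is easy to get wrong, here is a hedged usage sketch; the `select`/`conditions` dict layout follows the project README, and all table and field names are placeholders:

```python
from bigquery.query_builder import render_query

query = render_query(
    'dataset',
    ['2013_06_appspot_1'],
    select={'foo': {'alias': 'foo'}},
    conditions=[{
        'field': 'foo',
        'type': 'FLOAT',
        'comparators': [
            {'condition': '>=', 'negate': False, 'value': 1},
        ],
    }],
)
# Rendered roughly as:
#   SELECT foo as foo FROM [dataset.2013_06_appspot_1]
#   WHERE (foo >= FLOAT('1'))
```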
@@ -94,6 +95,9 @@ def get_client(project_id=None, credentials=None, swallow_results : bool If set to False, then return the actual response value instead of converting to boolean. Default True. + num_retries : int, optional + The number of times to retry the request. Default 0 (no retry). + Returns ------- @@ -147,7 +151,8 @@ def get_client(project_id=None, credentials=None, bq_service = _get_bq_service(credentials=credentials, service_url=service_url) - return BigQueryClient(bq_service, project_id, swallow_results) + return BigQueryClient(bq_service, project_id, swallow_results, + num_retries) def get_projects(bq_service): @@ -185,10 +190,12 @@ def _credentials(): class BigQueryClient(object): - def __init__(self, bq_service, project_id, swallow_results=True): + def __init__(self, bq_service, project_id, swallow_results=True, + num_retries=0): self.bigquery = bq_service self.project_id = project_id self.swallow_results = swallow_results + self.num_retries = num_retries self.cache = {} def _submit_query_job(self, query_data): @@ -226,7 +233,8 @@ def _submit_query_job(self, query_data): try: query_reply = job_collection.query( - projectId=self.project_id, body=query_data).execute() + projectId=self.project_id, body=query_data).execute( + num_retries=self.num_retries) except HttpError as e: if query_data.get("dryRun", False): return None, json.loads(e.content.decode('utf8')) @@ -276,7 +284,7 @@ def _insert_job(self, body_object): return job_collection.insert( projectId=self.project_id, body=body_object - ).execute() + ).execute(num_retries=self.num_retries) def query(self, query, max_results=None, timeout=0, dry_run=False, use_legacy_sql=None, external_udf_uris=None): """Submit a query to BigQuery. @@ -375,7 +383,7 @@ def get_table_schema(self, dataset, table): result = self.bigquery.tables().get( projectId=self.project_id, tableId=table, - datasetId=dataset).execute() + datasetId=dataset).execute(num_retries=self.num_retries) except HttpError as e: if int(e.resp['status']) == 404: logger.warn('Table %s.%s does not exist', dataset, table) @@ -481,7 +489,8 @@ def get_dataset(self, dataset_id): """ try: dataset = self.bigquery.datasets().get( - projectId=self.project_id, datasetId=dataset_id).execute() + projectId=self.project_id, datasetId=dataset_id).execute( + num_retries=self.num_retries) except HttpError: dataset = {} @@ -523,7 +532,7 @@ def get_table(self, dataset, table): try: table = self.bigquery.tables().get( projectId=self.project_id, datasetId=dataset, - tableId=table).execute() + tableId=table).execute(num_retries=self.num_retries) except HttpError: table = {} @@ -573,7 +582,7 @@ def create_table(self, dataset, table, schema, projectId=self.project_id, datasetId=dataset, body=body - ).execute() + ).execute(num_retries=self.num_retries) if self.swallow_results: return True else: @@ -621,7 +630,7 @@ def update_table(self, dataset, table, schema): tableId= table, datasetId=dataset, body=body - ).execute() + ).execute(num_retries=self.num_retries) if self.swallow_results: return True else: @@ -668,7 +677,7 @@ def patch_table(self, dataset, table, schema): projectId=self.project_id, datasetId=dataset, body=body - ).execute() + ).execute(num_retries=self.num_retries) if self.swallow_results: return True else: @@ -723,7 +732,7 @@ def create_view(self, dataset, view, query, use_legacy_sql=None): projectId=self.project_id, datasetId=dataset, body=body - ).execute() + ).execute(num_retries=self.num_retries) if self.swallow_results: return True else: @@ -759,7 +768,7 @@ def 
delete_table(self, dataset, table): projectId=self.project_id, datasetId=dataset, tableId=table - ).execute() + ).execute(num_retries=self.num_retries) if self.swallow_results: return True else: @@ -1212,7 +1221,7 @@ def wait_for_job(self, job, interval=5, timeout=60): sleep(interval) request = self.bigquery.jobs().get(projectId=self.project_id, jobId=job_id) - job_resource = request.execute() + job_resource = request.execute(num_retries=self.num_retries) self._raise_executing_exception_if_error(job_resource) complete = job_resource.get('status').get('state') == u'DONE' elapsed_time = time() - start_time @@ -1288,7 +1297,7 @@ def push_rows(self, dataset, table, rows, insert_id_key=None, datasetId=dataset, tableId=table, body=data - ).execute() + ).execute(num_retries=self.num_retries) if response.get('insertErrors'): logger.error('BigQuery insert errors: %s' % response) @@ -1382,7 +1391,7 @@ def _get_all_tables_for_dataset(self, dataset_id): """ result = self.bigquery.tables().list( projectId=self.project_id, - datasetId=dataset_id).execute() + datasetId=dataset_id).execute(num_retries=self.num_retries) page_token = result.get('nextPageToken') while page_token: @@ -1390,7 +1399,7 @@ def _get_all_tables_for_dataset(self, dataset_id): projectId=self.project_id, datasetId=dataset_id, pageToken=page_token - ).execute() + ).execute(num_retries=self.num_retries) page_token = res.get('nextPageToken') result['tables'] += res.get('tables', []) return result @@ -1553,7 +1562,7 @@ def get_query_results(self, job_id, offset=None, limit=None, startIndex=offset, maxResults=limit, pageToken=page_token, - timeoutMs=timeout * 1000).execute() + timeoutMs=timeout * 1000).execute(num_retries=self.num_retries) def _transform_row(self, row, schema): """Apply the given schema to the given BigQuery data row. 
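For context on the plumbing in the hunks above: `num_retries` is stored once on the client and forwarded to every `execute()` call, so a single constructor argument enables retries everywhere. A hedged usage sketch (project id, key file, and table names are placeholders):

```python
from bigquery import get_client

# num_retries is forwarded to request.execute(num_retries=...), where
# googleapiclient retries transient HTTP errors with randomized
# exponential backoff before raising.
client = get_client(project_id='my-project',
                    json_key_file='key.json',
                    readonly=True,
                    num_retries=5)

schema = client.get_table_schema('dataset', 'my_table')
```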
@@ -1708,7 +1717,8 @@ def create_dataset(self, dataset_id, friendly_name=None, description=None, location=location) response = datasets.insert(projectId=self.project_id, - body=dataset_data).execute() + body=dataset_data).execute( + num_retries=self.num_retries) if self.swallow_results: return True else: @@ -1732,7 +1742,7 @@ def get_datasets(self): try: datasets = self.bigquery.datasets() request = datasets.list(projectId=self.project_id) - result = request.execute() + result = request.execute(num_retries=self.num_retries) return result.get('datasets', []) except HttpError as e: logger.error("Cannot list datasets: {0}".format(e)) @@ -1766,7 +1776,7 @@ def delete_dataset(self, dataset_id, delete_contents=False): request = datasets.delete(projectId=self.project_id, datasetId=dataset_id, deleteContents=delete_contents) - response = request.execute() + response = request.execute(num_retries=self.num_retries) if self.swallow_results: return True else: @@ -1810,7 +1820,7 @@ def update_dataset(self, dataset_id, friendly_name=None, description=None, request = datasets.update(projectId=self.project_id, datasetId=dataset_id, body=body) - response = request.execute() + response = request.execute(num_retries=self.num_retries) if self.swallow_results: return True else: @@ -1853,7 +1863,7 @@ def patch_dataset(self, dataset_id, friendly_name=None, description=None, description, access) request = datasets.patch(projectId=self.project_id, datasetId=dataset_id, body=body) - response = request.execute() + response = request.execute(num_retries=self.num_retries) if self.swallow_results: return True else: From 289ad25b2b415a3d43ff6b75f3b4fbf48ce61a75 Mon Sep 17 00:00:00 2001 From: Yves Bastide Date: Tue, 5 Sep 2017 15:45:31 +0200 Subject: [PATCH 59/78] Fix tests Add `num_retries=0` to `execute.assert_called_with()`. Signed-off-by: Yves Bastide --- bigquery/tests/test_client.py | 77 +++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 30 deletions(-) diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py index 60bcc42..0bf5a18 100644 --- a/bigquery/tests/test_client.py +++ b/bigquery/tests/test_client.py @@ -292,7 +292,6 @@ def test_query(self): self.assertEquals(job_id, 'spiderman') self.assertEquals(results, []) - def test_query_max_results_set(self): """Ensure that we retrieve the job id from the query and the maxResults parameter is set. @@ -520,7 +519,7 @@ def test_get_response(self): projectId=self.project_id, jobId=job_id, startIndex=offset, maxResults=limit, pageToken=page_token, timeoutMs=1000) - mock_query_job.execute.assert_called_once_with() + mock_query_job.execute.assert_called_once_with(num_retries=0) self.assertEquals(actual, mock_query_reply) @@ -1485,7 +1484,8 @@ def test_table_exists(self): expected, self.client.get_table_schema(self.dataset, self.table)) self.mock_tables.get.assert_called_once_with( projectId=self.project, tableId=self.table, datasetId=self.dataset) - self.mock_tables.get.return_value.execute.assert_called_once_with() + self.mock_tables.get.return_value.execute. 
\ + assert_called_once_with(num_retries=0) def test_table_does_not_exist(self): """Ensure that None is returned if the table doesn't exist.""" @@ -1496,7 +1496,8 @@ def test_table_does_not_exist(self): self.client.get_table_schema(self.dataset, self.table)) self.mock_tables.get.assert_called_once_with( projectId=self.project, tableId=self.table, datasetId=self.dataset) - self.mock_tables.get.return_value.execute.assert_called_once_with() + self.mock_tables.get.return_value.execute. \ + assert_called_once_with(num_retries=0) @mock.patch('bigquery.client.BigQueryClient.get_query_results') @@ -1651,7 +1652,8 @@ def test_table_does_not_exist(self): self.mock_tables.get.assert_called_once_with( projectId=self.project, datasetId=self.dataset, tableId=self.table) - self.mock_tables.get.return_value.execute.assert_called_once_with() + self.mock_tables.get.return_value.execute. \ + assert_called_once_with(num_retries=0) def test_table_does_exist(self): """Ensure that if the table does exist, True is returned.""" @@ -1666,7 +1668,8 @@ def test_table_does_exist(self): self.mock_tables.get.assert_called_once_with( projectId=self.project, datasetId=self.dataset, tableId=self.table) - self.mock_tables.get.return_value.execute.assert_called_once_with() + self.mock_tables.get.return_value.execute. \ + assert_called_once_with(num_retries=0) class TestCreateTable(unittest.TestCase): @@ -1716,7 +1719,8 @@ def test_table_create_failed(self): self.mock_tables.insert.assert_called_with( projectId=self.project, datasetId=self.dataset, body=self.body) - self.mock_tables.insert.return_value.execute.assert_called_with() + self.mock_tables.insert.return_value.execute. \ + assert_called_with(num_retries=0) def test_table_create_success(self): """Ensure that if creating the table succeeds, True is returned, @@ -1742,7 +1746,8 @@ def test_table_create_success(self): self.mock_tables.insert.assert_called_with( projectId=self.project, datasetId=self.dataset, body=self.body) - self.mock_tables.insert.return_value.execute.assert_called_with() + self.mock_tables.insert.return_value.execute. \ + assert_called_with(num_retries=0) def test_table_create_body_with_expiration_time(self): """Ensure that if expiration_time has specified, @@ -1762,7 +1767,8 @@ def test_table_create_body_with_expiration_time(self): self.mock_tables.insert.assert_called_with( projectId=self.project, datasetId=self.dataset, body=body) - self.mock_tables.insert.return_value.execute.assert_called_with() + self.mock_tables.insert.return_value.execute. \ + assert_called_with(num_retries=0) def test_table_create_body_with_time_partitioning(self): """Ensure that if time_partitioning has specified, @@ -1783,7 +1789,8 @@ def test_table_create_body_with_time_partitioning(self): self.mock_tables.insert.assert_called_with( projectId=self.project, datasetId=self.dataset, body=body) - self.mock_tables.insert.return_value.execute.assert_called_with() + self.mock_tables.insert.return_value.execute. \ + assert_called_with(num_retries=0) class TestUpdateTable(unittest.TestCase): @@ -1830,9 +1837,11 @@ def test_table_update_failed(self): self.client.swallow_results = True self.mock_tables.update.assert_called_with( - projectId=self.project, tableId=self.table, datasetId=self.dataset, body=self.body) + projectId=self.project, tableId=self.table, datasetId=self.dataset, + body=self.body) - self.mock_tables.update.return_value.execute.assert_called_with() + self.mock_tables.update.return_value.execute. 
\ + assert_called_with(num_retries=0) def test_table_update_success(self): """Ensure that if updating the table succeeds, True is returned, @@ -1856,9 +1865,11 @@ def test_table_update_success(self): self.client.swallow_results = True self.mock_tables.update.assert_called_with( - projectId=self.project, tableId=self.table, datasetId=self.dataset, body=self.body) + projectId=self.project, tableId=self.table, datasetId=self.dataset, + body=self.body) - self.mock_tables.update.return_value.execute.assert_called_with() + self.mock_tables.update.return_value.execute. \ + assert_called_with(num_retries=0) class TestPatchTable(unittest.TestCase): @@ -1907,7 +1918,8 @@ def test_table_patch_failed(self): self.mock_tables.patch.assert_called_with( projectId=self.project, datasetId=self.dataset, body=self.body) - self.mock_tables.patch.return_value.execute.assert_called_with() + self.mock_tables.patch.return_value.execute. \ + assert_called_with(num_retries=0) def test_table_patch_success(self): """Ensure that if patching the table succeeds, True is returned, @@ -1933,7 +1945,8 @@ def test_table_patch_success(self): self.mock_tables.patch.assert_called_with( projectId=self.project, datasetId=self.dataset, body=self.body) - self.mock_tables.patch.return_value.execute.assert_called_with() + self.mock_tables.patch.return_value.execute. \ + assert_called_with(num_retries=0) class TestCreateView(unittest.TestCase): @@ -1978,7 +1991,8 @@ def test_view_create_failed(self): self.mock_tables.insert.assert_called_with( projectId=self.project, datasetId=self.dataset, body=self.body) - self.mock_tables.insert.return_value.execute.assert_called_with() + self.mock_tables.insert.return_value.execute. \ + assert_called_with(num_retries=0) def test_view_create_success(self): """Ensure that if creating the table succeeds, True is returned, @@ -2004,7 +2018,8 @@ def test_view_create_success(self): self.mock_tables.insert.assert_called_with( projectId=self.project, datasetId=self.dataset, body=self.body) - self.mock_tables.insert.return_value.execute.assert_called_with() + self.mock_tables.insert.return_value.execute. \ + assert_called_with(num_retries=0) class TestDeleteTable(unittest.TestCase): @@ -2040,7 +2055,8 @@ def test_delete_table_fail(self): self.mock_tables.delete.assert_called_with( projectId=self.project, datasetId=self.dataset, tableId=self.table) - self.mock_tables.delete.return_value.execute.assert_called_with() + self.mock_tables.delete.return_value.execute. \ + assert_called_with(num_retries=0) def test_delete_table_success(self): """Ensure that if deleting table succeeds, True is returned, @@ -2064,7 +2080,8 @@ def test_delete_table_success(self): self.mock_tables.delete.assert_called_with( projectId=self.project, datasetId=self.dataset, tableId=self.table) - self.mock_tables.delete.return_value.execute.assert_called_with() + self.mock_tables.delete.return_value.execute. 
\ + assert_called_with(num_retries=0) class TestParseTableListReponse(unittest.TestCase): @@ -2200,7 +2217,7 @@ def test_push_failed(self): projectId=self.project, datasetId=self.dataset, tableId=self.table, body=self.data) - execute_calls = [mock.call()] + execute_calls = [mock.call(num_retries=0)] self.mock_table_data.insertAll.return_value.execute.assert_has_calls( execute_calls) @@ -2254,7 +2271,7 @@ def test_push_exception(self): projectId=self.project, datasetId=self.dataset, tableId=self.table, body=self.data) - execute_calls = [mock.call()] + execute_calls = [mock.call(num_retries=0)] self.mock_table_data.insertAll.return_value.execute.assert_has_calls( execute_calls) @@ -2286,7 +2303,7 @@ def test_push_success(self): projectId=self.project, datasetId=self.dataset, tableId=self.table, body=self.data) - execute_calls = [mock.call()] + execute_calls = [mock.call(num_retries=0)] self.mock_table_data.insertAll.return_value.execute.assert_has_calls( execute_calls) @@ -2604,7 +2621,7 @@ def test_dataset_create_failed(self): projectId=self.project, body=self.body) self.mock_datasets.insert.return_value.execute. \ - assert_called_with() + assert_called_with(num_retries=0) def test_dataset_create_success(self): """Ensure that if creating the table fails, False is returned.""" @@ -2633,7 +2650,7 @@ def test_dataset_create_success(self): projectId=self.project, body=self.body) self.mock_datasets.insert.return_value.execute. \ - assert_called_with() + assert_called_with(num_retries=0) class TestDeleteDataset(unittest.TestCase): @@ -2669,7 +2686,7 @@ def test_delete_datasets_fail(self): self.client.swallow_results = True self.mock_datasets.delete.return_value.execute. \ - assert_called_with() + assert_called_with(num_retries=0) def test_delete_datasets_success(self): """Ensure that if deleting table succeeds, True is returned.""" @@ -2694,7 +2711,7 @@ def test_delete_datasets_success(self): deleteContents=False) self.mock_datasets.delete.return_value.execute. \ - assert_called_with() + assert_called_with(num_retries=0) def test_delete_datasets_delete_contents_success(self): """Ensure that if deleting table succeeds, True is returned.""" @@ -2719,7 +2736,7 @@ def test_delete_datasets_delete_contents_success(self): deleteContents=True) self.mock_datasets.delete.return_value.execute. \ - assert_called_with() + assert_called_with(num_retries=0) FULL_DATASET_LIST_RESPONSE = { @@ -2879,7 +2896,7 @@ def test_dataset_update_failed(self): projectId=self.project, datasetId=self.dataset, body=self.body) self.mock_datasets.update.return_value.execute. \ - assert_called_with() + assert_called_with(num_retries=0) def test_dataset_update_success(self): """Ensure that if creating the table fails, False is returned.""" @@ -2908,4 +2925,4 @@ def test_dataset_update_success(self): projectId=self.project, datasetId=self.dataset, body=self.body) self.mock_datasets.update.return_value.execute. \ - assert_called_with() + assert_called_with(num_retries=0) From 188ef7d5c3676d104fa996f2b1729b46c14d2694 Mon Sep 17 00:00:00 2001 From: Yves Bastide Date: Tue, 5 Sep 2017 15:45:47 +0200 Subject: [PATCH 60/78] Add tests with num_retries Maybe too many?.. 
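The sweep in patches 59 and 60 follows a single assertion pattern; below is a hedged, minimal reproduction of it outside the suite (the mocked service and table names are stand-ins):

```python
import mock

from bigquery import client

# Stub the discovery service, call through the client, then check that
# execute() received the default num_retries=0.
mock_bq_service = mock.Mock()
mock_tables = mock.Mock()
mock_bq_service.tables.return_value = mock_tables

bq = client.BigQueryClient(mock_bq_service, 'project')
bq.get_table('dataset', 'table')

mock_tables.get.return_value.execute.assert_called_once_with(num_retries=0)
```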
Signed-off-by: Yves Bastide --- bigquery/tests/test_client.py | 215 ++++++++++++++++++++++++++++++++++ 1 file changed, 215 insertions(+) diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py index 0bf5a18..a5e8161 100644 --- a/bigquery/tests/test_client.py +++ b/bigquery/tests/test_client.py @@ -2926,3 +2926,218 @@ def test_dataset_update_success(self): self.mock_datasets.update.return_value.execute. \ assert_called_with(num_retries=0) + + +class TestNumRetries(unittest.TestCase): + + def setUp(self): + client._bq_client = None + + self.mock_bq_service = mock.Mock() + self.mock_tables = mock.Mock() + self.mock_job_collection = mock.Mock() + self.mock_datasets = mock.Mock() + self.mock_table_data = mock.Mock() + self.mock_bq_service.tables.return_value = self.mock_tables + self.mock_bq_service.jobs.return_value = self.mock_job_collection + self.mock_bq_service.datasets.return_value = self.mock_datasets + self.mock_bq_service.tabledata.return_value = self.mock_table_data + + self.project_id = 'project' + self.num_retries = 5 + self.client = client.BigQueryClient(self.mock_bq_service, + self.project_id, + num_retries=self.num_retries) + self.dataset = 'dataset' + self.project = 'project' + self.table = 'table' + self.schema = [ + {'name': 'foo', 'type': 'STRING', 'mode': 'nullable'}, + {'name': 'bar', 'type': 'FLOAT', 'mode': 'nullable'} + ] + self.friendly_name = "friendly name" + self.description = "description" + self.access = [{'userByEmail': "bob@gmail.com"}] + self.query = 'SELECT "bar" foo, "foo" bar' + self.rows = [ + {'one': 'uno', 'two': 'dos'}, {'one': 'ein', 'two': 'zwei'}, + {'two': 'kiwi'}] + self.data = { + "kind": "bigquery#tableDataInsertAllRequest", + "rows": [{'insertId': "uno", 'json': {'one': 'uno', 'two': 'dos'}}, + {'insertId': "ein", 'json': + {'one': 'ein', 'two': 'zwei'}}, + {'json': {'two': 'kiwi'}}] + } + + def test_get_response(self): + job_id = 'bar' + + mock_query_job = mock.Mock() + mock_query_reply = mock.Mock() + mock_query_job.execute.return_value = mock_query_reply + self.mock_job_collection.getQueryResults.return_value = mock_query_job + + offset = 5 + limit = 10 + page_token = "token" + timeout = 1 + + self.client.get_query_results(job_id, offset, limit, + page_token, timeout) + + mock_query_job.execute. \ + assert_called_once_with(num_retries=self.num_retries) + + def test_table_exists(self): + expected = [ + {'type': 'FLOAT', 'name': 'foo', 'mode': 'NULLABLE'}, + {'type': 'INTEGER', 'name': 'bar', 'mode': 'NULLABLE'}, + {'type': 'INTEGER', 'name': 'baz', 'mode': 'NULLABLE'}, + ] + + self.mock_tables.get.return_value.execute.return_value = \ + {'schema': {'fields': expected}} + + self.client.get_table_schema(self.dataset, self.table) + self.mock_tables.get.return_value.execute. \ + assert_called_once_with(num_retries=self.num_retries) + + def test_table_create(self): + self.mock_tables.insert.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 'bar'}] + + self.client.create_table(self.dataset, self.table, + self.schema) + + self.mock_tables.insert.return_value.execute. \ + assert_called_with(num_retries=self.num_retries) + + def test_table_update(self): + self.mock_tables.update.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 'bar'}] + + self.client.update_table(self.dataset, self.table, + self.schema) + + self.mock_tables.update.return_value.execute. 
\ + assert_called_with(num_retries=self.num_retries) + + def test_table_patch(self): + self.mock_tables.patch.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 'bar'}] + + self.client.patch_table(self.dataset, self.table, + self.schema) + + self.mock_tables.patch.return_value.execute. \ + assert_called_with(num_retries=self.num_retries) + + def test_view_create(self): + body = { + 'view': {'query': self.query}, + 'tableReference': { + 'tableId': self.table, 'projectId': self.project, + 'datasetId': self.dataset + } + } + + self.mock_tables.insert.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 'bar'}] + + actual = self.client.create_view(self.dataset, self.table, + self.query) + + self.assertTrue(actual) + + self.mock_tables.insert.assert_called_with( + projectId=self.project, datasetId=self.dataset, body=body) + + self.mock_tables.insert.return_value.execute. \ + assert_called_with(num_retries=self.num_retries) + + def test_delete_table(self): + self.mock_tables.delete.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 'bar'}] + + actual = self.client.delete_table(self.dataset, self.table) + + self.assertTrue(actual) + + self.mock_tables.delete.assert_called_with( + projectId=self.project, datasetId=self.dataset, tableId=self.table) + + self.mock_tables.delete.return_value.execute. \ + assert_called_with(num_retries=self.num_retries) + + def test_push(self): + self.mock_table_data.insertAll.return_value.execute.return_value = { + 'status': 'foo'} + + actual = self.client.push_rows(self.dataset, self.table, self.rows, + 'one') + + self.assertTrue(actual) + + self.mock_bq_service.tabledata.assert_called_with() + + self.mock_table_data.insertAll.assert_called_with( + projectId=self.project, datasetId=self.dataset, tableId=self.table, + body=self.data) + + execute_calls = [mock.call(num_retries=self.num_retries)] + self.mock_table_data.insertAll.return_value.execute.assert_has_calls( + execute_calls) + + def test_dataset_create(self): + body = { + 'datasetReference': { + 'datasetId': self.dataset, + 'projectId': self.project}, + 'friendlyName': self.friendly_name, + 'description': self.description, + 'access': self.access + } + + self.mock_datasets.insert.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 'bar'}] + + actual = self.client.create_dataset(self.dataset, + self.friendly_name, + self.description, + self.access) + self.assertTrue(actual) + + self.mock_datasets.insert.assert_called_with( + projectId=self.project, body=body) + + self.mock_datasets.insert.return_value.execute. \ + assert_called_with(num_retries=self.num_retries) + + def test_delete_datasets(self): + self.mock_datasets.delete.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 'bar'}] + + actual = self.client.delete_dataset(self.dataset) + + self.assertTrue(actual) + + self.mock_datasets.delete.assert_called_with( + projectId=self.project, datasetId=self.dataset, + deleteContents=False) + + self.mock_datasets.delete.return_value.execute. \ + assert_called_with(num_retries=self.num_retries) + + def test_dataset_update(self): + self.mock_datasets.update.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 'bar'}] + + actual = self.client.update_dataset(self.dataset, + self.friendly_name, + self.description, + self.access) + self.assertTrue(actual) + + self.mock_datasets.update.return_value.execute. 
\ + assert_called_with(num_retries=self.num_retries)

From 4a569bd3b8ede0297b3973b15e550c998b03f3e9 Mon Sep 17 00:00:00 2001
From: Tyler Treat
Date: Wed, 20 Sep 2017 10:23:36 -0500
Subject: [PATCH 61/78] Try to fix travis

---
 .travis.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 9f422c6..1e1c28c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,4 +1,7 @@
 language: python
+before_install:
+  - sudo apt-get update -q
+  - sudo apt-get install pypy -y
 install:
   - python setup.py develop
   - pip install tox
@@ -7,7 +10,6 @@ notifications:
   email: false
 env:
   - TOXENV=py27
-  - TOXENV=py33
   - TOXENV=py34
   - TOXENV=nightly
   - TOXENV=pypy

From d308391cf9508f1568fb9647fd711e436a2978f4 Mon Sep 17 00:00:00 2001
From: Tyler Treat
Date: Mon, 2 Oct 2017 16:25:19 -0500
Subject: [PATCH 62/78] Bump version to 1.13.0

---
 bigquery/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bigquery/version.py b/bigquery/version.py
index 666b2f7..84c54b7 100644
--- a/bigquery/version.py
+++ b/bigquery/version.py
@@ -1 +1 @@
-__version__ = '1.12.0'
+__version__ = '1.13.0'

From 40de946cf1af7d6317666db28a5740aad42c39ea Mon Sep 17 00:00:00 2001
From: Alireza
Date: Tue, 30 Jan 2018 15:09:06 +0100
Subject: [PATCH 63/78] feat: Support `IS NULL`/`IS NOT NULL` condition

---
 bigquery/query_builder.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/bigquery/query_builder.py b/bigquery/query_builder.py
index b29d0cd..435bb73 100644
--- a/bigquery/query_builder.py
+++ b/bigquery/query_builder.py
@@ -241,6 +241,8 @@ def _render_condition(field, field_type, comparators):
         else:
             value = _render_condition_value(value, field_type)
             value = "(" + value + ")"
+    elif condition == "IS NULL" or condition == "IS NOT NULL":
+        return field + " " + condition
     elif condition == "BETWEEN":
         if isinstance(value, (tuple, list, set)) and len(value) == 2:
             value = ' AND '.join(

From 2ce1b8d9deb4a35c8d2759ef38ba27859a1d3ce5 Mon Sep 17 00:00:00 2001
From: Tuan Vu
Date: Wed, 14 Mar 2018 16:48:22 -0700
Subject: [PATCH 64/78] support a different project_id to run job

This supports authenticating to one project_id but running jobs in a different project_id.
---
 bigquery/client.py | 271 ++++++++++++++++++++++++++++++++-------
 1 file changed, 195 insertions(+), 76 deletions(-)

diff --git a/bigquery/client.py b/bigquery/client.py
index 847c9fb..0c6377e 100644
--- a/bigquery/client.py
+++ b/bigquery/client.py
@@ -198,6 +198,26 @@ def __init__(self, bq_service, project_id, swallow_results=True,
         self.num_retries = num_retries
         self.cache = {}
 
+    def _get_project_id(self, project_id=None):
+        """ Get the project_id to use for a request.
+
+        Default is self.project_id, the project the client authenticated to.
+        A different project_id can be passed when the client authenticates
+        to one project but runs jobs in another.
+
+        Parameters
+        ----------
+        project_id : str
+            BigQuery project_id
+
+        Returns
+        -------
+        project_id: BigQuery project_id
+        """
+        if project_id is None:
+            project_id = self.project_id
+        return project_id
+
     def _submit_query_job(self, query_data):
         """ Submit a query job to BigQuery.
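A hedged sketch of the new null checks from patch 63 above: the comparator still carries `negate` and `value` keys to satisfy the expected shape, but the new branch returns early, so neither is rendered:

```python
from bigquery.query_builder import render_query

# For IS NULL / IS NOT NULL the renderer emits "field IS NOT NULL"
# directly instead of comparing against a rendered value.
query = render_query(
    'dataset',
    ['2013_06_appspot_1'],
    select={'foo': {'alias': 'foo'}},
    conditions=[{
        'field': 'foo',
        'type': 'STRING',
        'comparators': [
            {'condition': 'IS NOT NULL', 'negate': False, 'value': ''},
        ],
    }],
)
```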
@@ -253,6 +273,27 @@ def _submit_query_job(self, query_data): return job_id, [self._transform_row(row, schema) for row in rows] + def _get_job_reference(self, job_id): + """ Get job reference from job_id + For more details, see: + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#resource + + Parameters + ---------- + job_id: + Id of the job + + Returns + ------- + job_reference: json of job_reference + """ + job_reference = { + "projectId": self.project_id, + "jobId": job_id + } + + return job_reference + def _insert_job(self, body_object): """ Submit a job to BigQuery @@ -362,7 +403,7 @@ def get_query_schema(self, job_id): return query_reply['schema']['fields'] - def get_table_schema(self, dataset, table): + def get_table_schema(self, dataset, table, project_id=None): """Return the table schema. Parameters @@ -371,6 +412,8 @@ def get_table_schema(self, dataset, table): The dataset containing the `table`. table : str The table to get the schema for + project_id: str, optional + The project of the dataset. Returns ------- @@ -380,8 +423,9 @@ def get_table_schema(self, dataset, table): """ try: + project_id = self._get_project_id(project_id) result = self.bigquery.tables().get( - projectId=self.project_id, + projectId=project_id, tableId=table, datasetId=dataset).execute(num_retries=self.num_retries) except HttpError as e: @@ -458,29 +502,33 @@ def get_query_rows(self, job_id, offset=None, limit=None, timeout=0): records += [self._transform_row(row, schema) for row in rows] return records[:limit] if limit else records - def check_dataset(self, dataset_id): + def check_dataset(self, dataset_id, project_id=None): """Check to see if a dataset exists. Parameters ---------- dataset_id : str Dataset unique id + project_id: str, optional + The project the dataset is in Returns ------- bool True if dataset at `dataset_id` exists, else Fasle - """ - dataset = self.get_dataset(dataset_id) + """ + dataset = self.get_dataset(dataset_id, project_id) return bool(dataset) - def get_dataset(self, dataset_id): + def get_dataset(self, dataset_id, project_id=None): """Retrieve a dataset if it exists, otherwise return an empty dict. Parameters ---------- dataset_id : str Dataset unique id + project_id: str, optional + The project the dataset is in Returns ------- @@ -488,15 +536,16 @@ def get_dataset(self, dataset_id): Contains dataset object if it exists, else empty """ try: + project_id = self._get_project_id(project_id) dataset = self.bigquery.datasets().get( - projectId=self.project_id, datasetId=dataset_id).execute( + projectId=project_id, datasetId=dataset_id).execute( num_retries=self.num_retries) except HttpError: dataset = {} return dataset - def check_table(self, dataset, table): + def check_table(self, dataset, table, project_id=None): """Check to see if a table exists. Parameters @@ -505,16 +554,18 @@ def check_table(self, dataset, table): The dataset to check table : str The name of the table + project_id: str, optional + The project the table is in Returns ------- bool True if table exists, else False """ - table = self.get_table(dataset, table) + table = self.get_table(dataset, table, project_id) return bool(table) - def get_table(self, dataset, table): + def get_table(self, dataset, table, project_id=None): """ Retrieve a table if it exists, otherwise return an empty dict. 
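The `_get_project_id` fallback above is what the per-method `project_id` parameters below lean on. A hedged usage sketch (both project ids and the key file are placeholders):

```python
from bigquery import get_client

# Authenticate against one project, read metadata from another.
client = get_client(project_id='auth-project', json_key_file='key.json')

# project_id omitted: _get_project_id() falls back to 'auth-project'.
schema = client.get_table_schema('dataset', 'my_table')

# project_id given: the same call targets 'other-project'.
schema = client.get_table_schema('dataset', 'my_table',
                                 project_id='other-project')
```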
Parameters @@ -523,6 +574,8 @@ def get_table(self, dataset, table): The dataset that the table is in table : str The name of the table + project_id: str, optional + The project that the table is in Returns ------- @@ -530,15 +583,16 @@ def get_table(self, dataset, table): Containing the table object if it exists, else empty """ try: + project_id = self._get_project_id(project_id) table = self.bigquery.tables().get( - projectId=self.project_id, datasetId=dataset, + projectId=project_id, datasetId=dataset, tableId=table).execute(num_retries=self.num_retries) except HttpError: table = {} return table - def create_table(self, dataset, table, schema, + def create_table(self, dataset, table, schema, project_id=None, expiration_time=None, time_partitioning=False): """Create a new table in the dataset. @@ -550,6 +604,8 @@ def create_table(self, dataset, table, schema, The name of the table to create schema : dict The table schema + project_id: str, optional + The project to create the table in expiration_time : int or double, optional The expiry time in milliseconds since the epoch. time_partitioning : bool, optional @@ -561,12 +617,13 @@ def create_table(self, dataset, table, schema, If the table was successfully created, or response from BigQuery if swallow_results is set to False """ + project_id = self._get_project_id(project_id) body = { 'schema': {'fields': schema}, 'tableReference': { 'tableId': table, - 'projectId': self.project_id, + 'projectId': project_id, 'datasetId': dataset } } @@ -579,7 +636,7 @@ def create_table(self, dataset, table, schema, try: table = self.bigquery.tables().insert( - projectId=self.project_id, + projectId=project_id, datasetId=dataset, body=body ).execute(num_retries=self.num_retries) @@ -589,14 +646,14 @@ def create_table(self, dataset, table, schema, return table except HttpError as e: - logger.error(('Cannot create table {0}.{1}\n' - 'Http Error: {2}').format(dataset, table, e.content)) + logger.error(('Cannot create table {0}.{1}.{2}\n' + 'Http Error: {3}').format(project_id, dataset, table, e.content)) if self.swallow_results: return False else: return {} - def update_table(self, dataset, table, schema): + def update_table(self, dataset, table, schema, project_id=None): """Update an existing table in the dataset. Parameters @@ -607,6 +664,8 @@ def update_table(self, dataset, table, schema): The name of the table to update schema : dict Table schema + project_id: str, optional + The project to update the table in Returns ------- @@ -614,19 +673,20 @@ def update_table(self, dataset, table, schema): bool indicating if the table was successfully updated or not, or response from BigQuery if swallow_results is set to False. 
""" + project_id = self._get_project_id(project_id) body = { 'schema': {'fields': schema}, 'tableReference': { 'tableId': table, - 'projectId': self.project_id, + 'projectId': project_id, 'datasetId': dataset } } try: result = self.bigquery.tables().update( - projectId=self.project_id, + projectId=project_id, tableId= table, datasetId=dataset, body=body @@ -637,14 +697,14 @@ def update_table(self, dataset, table, schema): return result except HttpError as e: - logger.error(('Cannot update table {0}.{1}\n' - 'Http Error: {2}').format(dataset, table, e.content)) + logger.error(('Cannot update table {0}.{1}.{2}\n' + 'Http Error: {3}').format(project_id, dataset, table, e.content)) if self.swallow_results: return False else: return {} - def patch_table(self, dataset, table, schema): + def patch_table(self, dataset, table, schema, project_id=None): """Patch an existing table in the dataset. Parameters @@ -655,6 +715,8 @@ def patch_table(self, dataset, table, schema): The name of the table to patch schema : dict The table schema + project_id: str, optional + The project to patch the table in Returns ------- @@ -662,19 +724,20 @@ def patch_table(self, dataset, table, schema): Bool indicating if the table was successfully patched or not, or response from BigQuery if swallow_results is set to False """ + project_id = self._get_project_id(project_id) body = { 'schema': {'fields': schema}, 'tableReference': { 'tableId': table, - 'projectId': self.project_id, + 'projectId': project_id, 'datasetId': dataset } } try: result = self.bigquery.tables().patch( - projectId=self.project_id, + projectId=project_id, datasetId=dataset, body=body ).execute(num_retries=self.num_retries) @@ -684,14 +747,14 @@ def patch_table(self, dataset, table, schema): return result except HttpError as e: - logger.error(('Cannot patch table {0}.{1}\n' - 'Http Error: {2}').format(dataset, table, e.content)) + logger.error(('Cannot patch table {0}.{1}.{2}\n' + 'Http Error: {3}').format(project_id, dataset, table, e.content)) if self.swallow_results: return False else: return {} - def create_view(self, dataset, view, query, use_legacy_sql=None): + def create_view(self, dataset, view, query, project_id=None, use_legacy_sql=None): """Create a new view in the dataset. Parameters @@ -702,6 +765,8 @@ def create_view(self, dataset, view, query, use_legacy_sql=None): The name of the view to create query : dict A query that BigQuery executes when the view is referenced. + project_id: str, optional + The project to create the view in use_legacy_sql : bool, optional If False, the query will use BigQuery's standard SQL (https://cloud.google.com/bigquery/sql-reference/) @@ -712,11 +777,12 @@ def create_view(self, dataset, view, query, use_legacy_sql=None): bool indicating if the view was successfully created or not, or response from BigQuery if swallow_results is set to False. """ + project_id = self._get_project_id(project_id) body = { 'tableReference': { 'tableId': view, - 'projectId': self.project_id, + 'projectId': project_id, 'datasetId': dataset }, 'view': { @@ -729,7 +795,7 @@ def create_view(self, dataset, view, query, use_legacy_sql=None): try: view = self.bigquery.tables().insert( - projectId=self.project_id, + projectId=project_id, datasetId=dataset, body=body ).execute(num_retries=self.num_retries) @@ -746,7 +812,7 @@ def create_view(self, dataset, view, query, use_legacy_sql=None): else: return {} - def delete_table(self, dataset, table): + def delete_table(self, dataset, table, project_id=None): """Delete a table from the dataset. 
Parameters @@ -755,6 +821,8 @@ def delete_table(self, dataset, table): The dataset to delete the table from. table : str The name of the table to delete + project_id: str, optional + String id of the project Returns ------- @@ -764,8 +832,9 @@ def delete_table(self, dataset, table): """ try: + project_id = self._get_project_id(project_id) response = self.bigquery.tables().delete( - projectId=self.project_id, + projectId=project_id, datasetId=dataset, tableId=table ).execute(num_retries=self.num_retries) @@ -782,7 +851,7 @@ def delete_table(self, dataset, table): else: return {} - def get_tables(self, dataset_id, app_id, start_time, end_time): + def get_tables(self, dataset_id, app_id, start_time, end_time, project_id=None): """Retrieve a list of tables that are related to the given app id and are inside the range of start and end times. @@ -796,6 +865,8 @@ def get_tables(self, dataset_id, app_id, start_time, end_time): The datetime or unix time after which records will be fetched. end_time : Union[datetime, int] The datetime or unix time up to which records will be fetched. + project_id: str, optional + String id of the project Returns ------- @@ -809,7 +880,7 @@ def get_tables(self, dataset_id, app_id, start_time, end_time): if isinstance(end_time, datetime): end_time = calendar.timegm(end_time.utctimetuple()) - every_table = self._get_all_tables(dataset_id) + every_table = self._get_all_tables(dataset_id, project_id) app_tables = every_table.get(app_id, {}) return self._filter_tables_by_time(app_tables, start_time, end_time) @@ -820,6 +891,7 @@ def import_data_from_uris( dataset, table, schema=None, + project_id=None, job=None, source_format=None, create_disposition=None, @@ -848,11 +920,13 @@ def import_data_from_uris( String id of the dataset table : str String id of the table - job : str, optional - Identifies the job (a unique job id is automatically generated if - not provided) schema : list, optional Represents the BigQuery schema + project_id: str, optional + String id of the project + job : str, optional + Identifies the job (a unique job id is automatically generated if + not provided) source_format : str, optional One of the JOB_SOURCE_FORMAT_* constants create_disposition : str, optional @@ -889,9 +963,11 @@ def import_data_from_uris( source_uris = source_uris if isinstance(source_uris, list) \ else [source_uris] + project_id = self._get_project_id(project_id) + configuration = { "destinationTable": { - "projectId": self.project_id, + "projectId": project_id, "tableId": table, "datasetId": dataset }, @@ -963,10 +1039,7 @@ def import_data_from_uris( "configuration": { 'load': configuration }, - "jobReference": { - "projectId": self.project_id, - "jobId": job - } + "jobReference": self._get_job_reference(job) } logger.debug("Creating load job %s" % body) @@ -979,6 +1052,7 @@ def export_data_to_uris( destination_uris, dataset, table, + project_id=None, job=None, compression=None, destination_format=None, @@ -999,6 +1073,8 @@ def export_data_to_uris( String id of the dataset table : str String id of the table + project_id: str, optional + String id of the project job : str, optional String identifying the job (a unique jobid is automatically generated if not provided) @@ -1024,9 +1100,11 @@ def export_data_to_uris( destination_uris = destination_uris \ if isinstance(destination_uris, list) else [destination_uris] + project_id = self._get_project_id(project_id) + configuration = { "sourceTable": { - "projectId": self.project_id, + "projectId": project_id, "tableId": table, 
"datasetId": dataset }, @@ -1057,10 +1135,7 @@ def export_data_to_uris( "configuration": { 'extract': configuration }, - "jobReference": { - "projectId": self.project_id, - "jobId": job - } + "jobReference": self._get_job_reference(job_id) } logger.info("Creating export job %s" % body) @@ -1073,6 +1148,7 @@ def write_to_table( query, dataset=None, table=None, + project_id=None, external_udf_uris=None, allow_large_results=None, use_query_cache=None, @@ -1097,6 +1173,8 @@ def write_to_table( String id of the dataset table : str, optional String id of the table + project_id: str, optional + String id of the project external_udf_uris : list, optional Contains external UDF URIs. If given, URIs must be Google Cloud Storage and have .js extensions. @@ -1138,9 +1216,11 @@ def write_to_table( "query": query, } + project_id = self._get_project_id(project_id) + if dataset and table: configuration['destinationTable'] = { - "projectId": self.project_id, + "projectId": project_id, "tableId": table, "datasetId": dataset } @@ -1233,7 +1313,7 @@ def wait_for_job(self, job, interval=5, timeout=60): return job_resource - def push_rows(self, dataset, table, rows, insert_id_key=None, + def push_rows(self, dataset, table, rows, project_id=None, insert_id_key=None, skip_invalid_rows=None, ignore_unknown_values=None, template_suffix=None): """Upload rows to BigQuery table. @@ -1244,6 +1324,8 @@ def push_rows(self, dataset, table, rows, insert_id_key=None, The dataset to upload to table : str The name of the table to insert rows into + project_id: str, optional + The project to upload to rows : list A ``list`` of rows (``dict`` objects) to add to the table insert_id_key : str, optional @@ -1292,8 +1374,9 @@ def push_rows(self, dataset, table, rows, insert_id_key=None, data['templateSuffix'] = template_suffix try: + project_id = self._get_project_id(project_id) response = table_data.insertAll( - projectId=self.project_id, + projectId=project_id, datasetId=dataset, tableId=table, body=data @@ -1325,19 +1408,21 @@ def push_rows(self, dataset, table, rows, insert_id_key=None, }] } - def get_all_tables(self, dataset_id): + def get_all_tables(self, dataset_id, project_id=None): """Retrieve a list of tables for the dataset. Parameters ---------- dataset_id : str The dataset to retrieve table data for. + project_id: str + Unique ``str`` identifying the BigQuery project contains the dataset Returns ------- A ``list`` with all table names """ - tables_data = self._get_all_tables_for_dataset(dataset_id) + tables_data = self._get_all_tables_for_dataset(dataset_id, project_id) tables = [] for table in tables_data.get('tables', []): @@ -1346,7 +1431,7 @@ def get_all_tables(self, dataset_id): tables.append(table_name) return tables - def _get_all_tables(self, dataset_id, cache=False): + def _get_all_tables(self, dataset_id, project_id=None, cache=False): """Retrieve the list of tables for dataset, that respect the formats: * appid_YYYY_MM * YYYY_MM_appid @@ -1355,6 +1440,8 @@ def _get_all_tables(self, dataset_id, cache=False): ---------- dataset_id : str The dataset to retrieve table names for + project_id: str + Unique ``str`` identifying the BigQuery project contains the dataset cache : bool, optional To use cached value or not (default False). Timeout value equals CACHE_TIMEOUT. 
@@ -1371,24 +1458,28 @@ def _get_all_tables(self, dataset_id, cache=False): do_fetch = False if do_fetch: - result = self._get_all_tables_for_dataset(dataset_id) + result = self._get_all_tables_for_dataset(dataset_id, project_id) self.cache[dataset_id] = (datetime.now(), result) return self._parse_table_list_response(result) - def _get_all_tables_for_dataset(self, dataset_id): + def _get_all_tables_for_dataset(self, dataset_id, project_id=None): """Retrieve a list of all tables for the dataset. Parameters ---------- dataset_id : str The dataset to retrieve table names for + project_id: str + Unique ``str`` identifying the BigQuery project contains the dataset Returns ------- dict A ``dict`` containing tables key with all tables """ + project_id = self._get_project_id(project_id) + result = self.bigquery.tables().list( projectId=self.project_id, datasetId=dataset_id).execute(num_retries=self.num_retries) @@ -1682,7 +1773,7 @@ def _raise_executing_exception_if_error(self, job): # # DataSet manipulation methods # - def create_dataset(self, dataset_id, friendly_name=None, description=None, + def create_dataset(self, dataset_id, project_id=None, friendly_name=None, description=None, access=None, location=None): """Create a new BigQuery dataset. @@ -1691,6 +1782,8 @@ def create_dataset(self, dataset_id, friendly_name=None, description=None, dataset_id : str Unique ``str`` identifying the dataset with the project (the referenceID of the dataset, not the integer id of the dataset) + project_id: str + Unique ``str`` identifying the BigQuery project contains the dataset friendly_name: str, optional A human readable name description: str, optional @@ -1708,15 +1801,19 @@ def create_dataset(self, dataset_id, friendly_name=None, description=None, ``bool`` indicating if dataset was created or not, or response from BigQuery if swallow_results is set for False """ - try: + project_id = self._get_project_id(project_id) + + try: datasets = self.bigquery.datasets() - dataset_data = self.dataset_resource(dataset_id, + dataset_data = self.dataset_resource(dataset_id, + project_id=project_id, friendly_name=friendly_name, description=description, access=access, - location=location) + location=location + ) - response = datasets.insert(projectId=self.project_id, + response = datasets.insert(projectId=project_id, body=dataset_data).execute( num_retries=self.num_retries) if self.swallow_results: @@ -1731,31 +1828,40 @@ def create_dataset(self, dataset_id, friendly_name=None, description=None, else: return {} - def get_datasets(self): + def get_datasets(self, project_id=None): """List all datasets in the project. + + Parameters + ---------- + project_id: str + Unique ``str`` identifying the BigQuery project contains the dataset Returns ------- list Dataset resources """ - try: + project_id = self._get_project_id(project_id) + + try: datasets = self.bigquery.datasets() - request = datasets.list(projectId=self.project_id) + request = datasets.list(projectId=project_id) result = request.execute(num_retries=self.num_retries) return result.get('datasets', []) except HttpError as e: logger.error("Cannot list datasets: {0}".format(e)) return None - def delete_dataset(self, dataset_id, delete_contents=False): + def delete_dataset(self, dataset_id, project_id=None, delete_contents=False): """Delete a BigQuery dataset. 
Parameters ---------- dataset_id : str - Unique ``str`` identifying the datset with the project (the + Unique ``str`` identifying the dataset with the project (the referenceId of the dataset) + project_id: str + Unique ``str`` identifying the BigQuery project contains the dataset delete_contents : bool, optional If True, forces the deletion of the dataset even when the dataset contains data (Default = False) @@ -1771,9 +1877,11 @@ def delete_dataset(self, dataset_id, delete_contents=False): HttpError 404 when dataset with dataset_id does not exist """ - try: + project_id = self._get_project_id(project_id) + + try: datasets = self.bigquery.datasets() - request = datasets.delete(projectId=self.project_id, + request = datasets.delete(projectId=project_id, datasetId=dataset_id, deleteContents=delete_contents) response = request.execute(num_retries=self.num_retries) @@ -1789,7 +1897,7 @@ def delete_dataset(self, dataset_id, delete_contents=False): else: return {} - def update_dataset(self, dataset_id, friendly_name=None, description=None, + def update_dataset(self, dataset_id, project_id=None, friendly_name=None, description=None, access=None): """Updates information in an existing dataset. The update method replaces the entire dataset resource, whereas the patch method only @@ -1800,6 +1908,8 @@ def update_dataset(self, dataset_id, friendly_name=None, description=None, dataset_id : str Unique ``str`` identifying the dataset with the project (the referencedId of the dataset) + project_id: str + Unique ``str`` identifying the BigQuery project contains the dataset friendly_name : str, optional An optional descriptive name for the dataset. description : str, optional @@ -1813,11 +1923,13 @@ def update_dataset(self, dataset_id, friendly_name=None, description=None, ``bool`` indicating if the update was successful or not, or response from BigQuery if swallow_results is set for False. """ - try: + project_id = self._get_project_id(project_id) + + try: datasets = self.bigquery.datasets() - body = self.dataset_resource(dataset_id, friendly_name, + body = self.dataset_resource(dataset_id, project_id, friendly_name, description, access) - request = datasets.update(projectId=self.project_id, + request = datasets.update(projectId=project_id, datasetId=dataset_id, body=body) response = request.execute(num_retries=self.num_retries) @@ -1833,7 +1945,7 @@ def update_dataset(self, dataset_id, friendly_name=None, description=None, else: return {} - def patch_dataset(self, dataset_id, friendly_name=None, description=None, + def patch_dataset(self, dataset_id, project_id=None, friendly_name=None, description=None, access=None): """Updates information in an existing dataset. The update method replaces the entire dataset resource, whereas the patch method only @@ -1844,6 +1956,8 @@ def patch_dataset(self, dataset_id, friendly_name=None, description=None, dataset_id : str Unique string idenfitying the dataset with the project (the referenceId of the dataset) + project_id: str + Unique ``str`` identifying the BigQuery project contains the dataset friendly_name : str, optional An optional descriptive name for the dataset. description : str, optional @@ -1857,11 +1971,13 @@ def patch_dataset(self, dataset_id, friendly_name=None, description=None, ``bool`` indicating if the patch was successful or not, or response from BigQuery if swallow_results is set for False. 
""" - try: + project_id = self._get_project_id(project_id) + + try: datasets = self.bigquery.datasets() - body = self.dataset_resource(dataset_id, friendly_name, + body = self.dataset_resource(dataset_id, project_id, friendly_name, description, access) - request = datasets.patch(projectId=self.project_id, + request = datasets.patch(projectId=project_id, datasetId=dataset_id, body=body) response = request.execute(num_retries=self.num_retries) if self.swallow_results: @@ -1875,7 +1991,7 @@ def patch_dataset(self, dataset_id, friendly_name=None, description=None, else: return {} - def dataset_resource(self, ref_id, friendly_name=None, description=None, + def dataset_resource(self, ref_id, project_id=None, friendly_name=None, description=None, access=None, location=None): """See https://developers.google.com/bigquery/docs/reference/v2/datasets#resource @@ -1884,6 +2000,8 @@ def dataset_resource(self, ref_id, friendly_name=None, description=None, ---------- ref_id : str Dataset id (the reference id, not the integer id) + project_id: str + Unique ``str`` identifying the BigQuery project contains the dataset friendly_name : str, optional An optional descriptive name for the dataset description : str, optional @@ -1898,10 +2016,11 @@ def dataset_resource(self, ref_id, friendly_name=None, description=None, dict Representing BigQuery dataset resource """ + project_id = self._get_project_id(project_id) data = { "datasetReference": { "datasetId": ref_id, - "projectId": self.project_id + "projectId": project_id } } if friendly_name: From 855be4e7c0fe4744072fb542a71e2d793263be3c Mon Sep 17 00:00:00 2001 From: Tuan Vu Date: Wed, 14 Mar 2018 21:44:08 -0700 Subject: [PATCH 65/78] update client and test_client to support a different project_id to run job --- bigquery/client.py | 155 ++++++++++++++++++---------------- bigquery/tests/test_client.py | 16 ++-- 2 files changed, 90 insertions(+), 81 deletions(-) diff --git a/bigquery/client.py b/bigquery/client.py index 0c6377e..b9d4e51 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -421,9 +421,9 @@ def get_table_schema(self, dataset, table, project_id=None): A ``list`` of ``dict`` objects that represent the table schema. If the table doesn't exist, None is returned. """ + project_id = self._get_project_id(project_id) - try: - project_id = self._get_project_id(project_id) + try: result = self.bigquery.tables().get( projectId=project_id, tableId=table, @@ -535,8 +535,9 @@ def get_dataset(self, dataset_id, project_id=None): dict Contains dataset object if it exists, else empty """ - try: - project_id = self._get_project_id(project_id) + project_id = self._get_project_id(project_id) + + try: dataset = self.bigquery.datasets().get( projectId=project_id, datasetId=dataset_id).execute( num_retries=self.num_retries) @@ -582,8 +583,8 @@ def get_table(self, dataset, table, project_id=None): dict Containing the table object if it exists, else empty """ - try: - project_id = self._get_project_id(project_id) + project_id = self._get_project_id(project_id) + try: table = self.bigquery.tables().get( projectId=project_id, datasetId=dataset, tableId=table).execute(num_retries=self.num_retries) @@ -592,8 +593,9 @@ def get_table(self, dataset, table, project_id=None): return table - def create_table(self, dataset, table, schema, project_id=None, - expiration_time=None, time_partitioning=False): + def create_table(self, dataset, table, schema, + expiration_time=None, time_partitioning=False, + project_id=None): """Create a new table in the dataset. 
         Parameters
@@ -603,13 +605,13 @@ def create_table(self, dataset, table, schema, project_id=None,
         table : str
             The name of the table to create
         schema : dict
             The table schema
-        project_id: str, optional
-            The project to create the table in
         expiration_time : int or double, optional
             The expiry time in milliseconds since the epoch.
         time_partitioning : bool, optional
             Create a time partitioning.
+        project_id: str, optional
+            The project to create the table in
 
         Returns
         -------
@@ -754,7 +756,7 @@ def patch_table(self, dataset, table, schema, project_id=None):
         else:
             return {}
 
-    def create_view(self, dataset, view, query, project_id=None, use_legacy_sql=None):
+    def create_view(self, dataset, view, query, use_legacy_sql=None, project_id=None):
         """Create a new view in the dataset.
 
         Parameters
@@ -764,12 +766,12 @@ def create_view(self, dataset, view, query, project_id=None, use_legacy_sql=None
         view : str
             The name of the view to create
         query : dict
             A query that BigQuery executes when the view is referenced.
-        project_id: str, optional
-            The project to create the view in
         use_legacy_sql : bool, optional
             If False, the query will use BigQuery's standard SQL
             (https://cloud.google.com/bigquery/sql-reference/)
+        project_id: str, optional
+            The project to create the view in
 
         Returns
         -------
@@ -830,9 +832,9 @@ def delete_table(self, dataset, table, project_id=None):
             bool indicating if the table was successfully deleted or not,
             or response from BigQuery if swallow_results is set to False.
         """
+        project_id = self._get_project_id(project_id)
 
-        try:
-            project_id = self._get_project_id(project_id)
+        try:
             response = self.bigquery.tables().delete(
                 projectId=project_id,
                 datasetId=dataset,
@@ -890,8 +892,7 @@ def import_data_from_uris(
             source_uris,
             dataset,
             table,
             schema=None,
-            project_id=None,
             job=None,
             source_format=None,
             create_disposition=None,
@@ -904,6 +905,7 @@ def import_data_from_uris(
             field_delimiter=None,
             quote=None,
             skip_leading_rows=None,
+            project_id=None,
     ):
         """
         Imports data into a BigQuery table from cloud storage. Optional
@@ -921,9 +923,7 @@ def import_data_from_uris(
         table : str
             String id of the table
         schema : list, optional
             Represents the BigQuery schema
-        project_id: str, optional
-            String id of the project
         job : str, optional
             Identifies the job (a unique job id is automatically generated
             if not provided)
@@ -949,6 +949,8 @@ def import_data_from_uris(
             Quote character for csv only
         skip_leading_rows : int, optional
             For csv only
+        project_id: str, optional
+            String id of the project
 
         Returns
         -------
@@ -1051,13 +1053,13 @@ def export_data_to_uris(
             self,
             destination_uris,
             dataset,
             table,
-            project_id=None,
             job=None,
             compression=None,
             destination_format=None,
             print_header=None,
             field_delimiter=None,
+            project_id=None,
     ):
         """
         Export data from a BigQuery table to cloud storage. Optional arguments
@@ -1072,9 +1074,7 @@ def export_data_to_uris(
         dataset : str
             String id of the dataset
         table : str
             String id of the table
-        project_id: str, optional
-            String id of the project
         job : str, optional
             String identifying the job (a unique jobid is automatically
             generated if not provided)
@@ -1086,6 +1086,8 @@ def export_data_to_uris(
             Whether or not to print the header
         field_delimiter : str, optional
             Character separating fields in delimited file
+        project_id: str, optional
+            String id of the project
 
         Returns
         -------
@@ -1135,7 +1137,7 @@ def export_data_to_uris(
             "configuration": {
                 'extract': configuration
             },
-            "jobReference": self._get_job_reference(job_id)
+            "jobReference": self._get_job_reference(job)
         }
 
         logger.info("Creating export job %s" % body)
@@ -1147,8 +1149,7 @@ def write_to_table(
             self,
             query,
             dataset=None,
             table=None,
-            project_id=None,
             external_udf_uris=None,
             allow_large_results=None,
             use_query_cache=None,
@@ -1157,7 +1158,8 @@ def write_to_table(
             write_disposition=None,
             use_legacy_sql=None,
             maximum_billing_tier=None,
-            flatten=None
+            flatten=None,
+            project_id=None,
     ):
         """
         Write query result to table. If dataset or table is not provided,
@@ -1172,9 +1174,7 @@ def write_to_table(
         dataset : str, optional
             String id of the dataset
         table : str, optional
             String id of the table
-        project_id: str, optional
-            String id of the project
         external_udf_uris : list, optional
             Contains external UDF URIs. If given, URIs must be Google Cloud
             Storage and have .js extensions.
@@ -1200,6 +1200,8 @@ def write_to_table(
         flatten : bool, optional
             Whether or not to flatten nested and repeated fields
             in query results
+        project_id: str, optional
+            String id of the project
 
         Returns
         -------
@@ -1313,9 +1315,9 @@ def wait_for_job(self, job, interval=5, timeout=60):
 
         return job_resource
 
-    def push_rows(self, dataset, table, rows, project_id=None, insert_id_key=None,
+    def push_rows(self, dataset, table, rows, insert_id_key=None,
                   skip_invalid_rows=None, ignore_unknown_values=None,
-                  template_suffix=None):
+                  template_suffix=None, project_id=None):
         """Upload rows to BigQuery table.
 
         Parameters
@@ -1323,9 +1325,7 @@ def push_rows(self, dataset, table, rows, project_id=None, insert_id_key=None,
         dataset : str
             The dataset to upload to
         table : str
             The name of the table to insert rows into
-        project_id: str, optional
-            The project to upload to
         rows : list
             A ``list`` of rows (``dict`` objects) to add to the table
         insert_id_key : str, optional
             Key for insertId in row
         skip_invalid_rows : bool, optional
             Insert all valid rows of a request, even if invalid rows exist.
         ignore_unknown_values : bool, optional
             Accept rows that contain values that do not match the schema.
         template_suffix : str, optional
             Inserts the rows into {table}{template_suffix}.
             If table {table}{template_suffix} doesn't exist, create from {table}.
+        project_id: str, optional
+            The project to upload to
 
         Returns
         -------
         Union[bool, dict]
             bool indicating if insert succeeded or not, or response
             from BigQuery if swallow_results is set to False.
""" - + project_id = self._get_project_id(project_id) table_data = self.bigquery.tabledata() rows_data = [] @@ -1373,8 +1375,7 @@ def push_rows(self, dataset, table, rows, project_id=None, insert_id_key=None, if template_suffix is not None: data['templateSuffix'] = template_suffix - try: - project_id = self._get_project_id(project_id) + try: response = table_data.insertAll( projectId=project_id, datasetId=dataset, @@ -1431,7 +1432,7 @@ def get_all_tables(self, dataset_id, project_id=None): tables.append(table_name) return tables - def _get_all_tables(self, dataset_id, project_id=None, cache=False): + def _get_all_tables(self, dataset_id, cache=False, project_id=None): """Retrieve the list of tables for dataset, that respect the formats: * appid_YYYY_MM * YYYY_MM_appid @@ -1439,12 +1440,12 @@ def _get_all_tables(self, dataset_id, project_id=None, cache=False): Parameters ---------- dataset_id : str - The dataset to retrieve table names for - project_id: str - Unique ``str`` identifying the BigQuery project contains the dataset + The dataset to retrieve table names for cache : bool, optional To use cached value or not (default False). Timeout value equals CACHE_TIMEOUT. + project_id: str + Unique ``str`` identifying the BigQuery project contains the dataset Returns ------- @@ -1773,17 +1774,15 @@ def _raise_executing_exception_if_error(self, job): # # DataSet manipulation methods # - def create_dataset(self, dataset_id, project_id=None, friendly_name=None, description=None, - access=None, location=None): + def create_dataset(self, dataset_id, friendly_name=None, description=None, + access=None, location=None, project_id=None): """Create a new BigQuery dataset. Parameters ---------- dataset_id : str Unique ``str`` identifying the dataset with the project (the - referenceID of the dataset, not the integer id of the dataset) - project_id: str - Unique ``str`` identifying the BigQuery project contains the dataset + referenceID of the dataset, not the integer id of the dataset) friendly_name: str, optional A human readable name description: str, optional @@ -1794,6 +1793,8 @@ def create_dataset(self, dataset_id, project_id=None, friendly_name=None, descri location : str, optional Indicating where dataset should be stored: EU or US (see https://developers.google.com/bigquery/docs/reference/v2/datasets#resource) + project_id: str + Unique ``str`` identifying the BigQuery project contains the dataset Returns ------- @@ -1852,19 +1853,19 @@ def get_datasets(self, project_id=None): logger.error("Cannot list datasets: {0}".format(e)) return None - def delete_dataset(self, dataset_id, project_id=None, delete_contents=False): + def delete_dataset(self, dataset_id, delete_contents=False, project_id=None): """Delete a BigQuery dataset. 
         Parameters
         ----------
         dataset_id : str
             Unique ``str`` identifying the dataset with the project (the
             referenceId of the dataset)
-        project_id: str
-            Unique ``str`` identifying the BigQuery project that contains the dataset
         delete_contents : bool, optional
             If True, forces the deletion of the dataset even when the dataset
             contains data (Default = False)
+        project_id: str, optional
+            Unique ``str`` identifying the BigQuery project that contains the dataset
 
         Returns
         -------
@@ -1897,8 +1898,8 @@ def delete_dataset(self, dataset_id, project_id=None, delete_contents=False):
         else:
             return {}
 
-    def update_dataset(self, dataset_id, project_id=None, friendly_name=None, description=None,
-                       access=None):
+    def update_dataset(self, dataset_id, friendly_name=None, description=None,
+                       access=None, project_id=None):
         """Updates information in an existing dataset. The update method
         replaces the entire dataset resource, whereas the patch method only
         replaces fields that are provided in the submitted dataset resource.
@@ -1907,15 +1908,15 @@ def update_dataset(self, dataset_id, friendly_name=None, description=None,
         ----------
         dataset_id : str
             Unique ``str`` identifying the dataset with the project (the
             referenceId of the dataset)
-        project_id: str
-            Unique ``str`` identifying the BigQuery project that contains the dataset
         friendly_name : str, optional
             An optional descriptive name for the dataset.
         description : str, optional
             An optional description of the dataset.
         access : list, optional
             Indicating access permissions
+        project_id: str, optional
+            Unique ``str`` identifying the BigQuery project that contains the dataset
 
         Returns
         -------
@@ -1927,8 +1928,12 @@ def update_dataset(self, dataset_id, friendly_name=None, description=None,
 
         try:
             datasets = self.bigquery.datasets()
-            body = self.dataset_resource(dataset_id, project_id, friendly_name,
-                                         description, access)
+            body = self.dataset_resource(dataset_id,
+                                         friendly_name=friendly_name,
+                                         description=description,
+                                         access=access,
+                                         project_id=project_id)
+
             request = datasets.update(projectId=project_id,
                                       datasetId=dataset_id,
                                       body=body)
@@ -1945,8 +1950,8 @@ def update_dataset(self, dataset_id, friendly_name=None, description=None,
         else:
             return {}
 
-    def patch_dataset(self, dataset_id, project_id=None, friendly_name=None, description=None,
-                      access=None):
+    def patch_dataset(self, dataset_id, friendly_name=None, description=None,
+                      access=None, project_id=None):
         """Updates information in an existing dataset. The update method
         replaces the entire dataset resource, whereas the patch method only
         replaces fields that are provided in the submitted dataset resource.
@@ -1955,15 +1960,15 @@ def patch_dataset(self, dataset_id, friendly_name=None, description=None,
         ----------
         dataset_id : str
             Unique string identifying the dataset with the project (the
             referenceId of the dataset)
-        project_id: str
-            Unique ``str`` identifying the BigQuery project that contains the dataset
         friendly_name : str, optional
             An optional descriptive name for the dataset.
         description : str, optional
             An optional description of the dataset.
         access : list, optional
             Indicating access permissions.
+        project_id: str, optional
+            Unique ``str`` identifying the BigQuery project that contains the dataset
 
         Returns
         -------
@@ -1975,8 +1980,11 @@ def patch_dataset(self, dataset_id, friendly_name=None, description=None,
 
         try:
             datasets = self.bigquery.datasets()
-            body = self.dataset_resource(dataset_id, project_id, friendly_name,
-                                         description, access)
+            body = self.dataset_resource(dataset_id,
+                                         friendly_name=friendly_name,
+                                         description=description,
+                                         access=access,
+                                         project_id=project_id)
             request = datasets.patch(projectId=project_id,
                                      datasetId=dataset_id, body=body)
             response = request.execute(num_retries=self.num_retries)
@@ -1991,17 +1999,15 @@ def patch_dataset(self, dataset_id, friendly_name=None, description=None,
         else:
             return {}
 
-    def dataset_resource(self, ref_id, project_id=None, friendly_name=None, description=None,
-                         access=None, location=None):
+    def dataset_resource(self, ref_id, friendly_name=None, description=None,
+                         access=None, location=None, project_id=None):
         """See
         https://developers.google.com/bigquery/docs/reference/v2/datasets#resource
 
         Parameters
         ----------
         ref_id : str
             Dataset id (the reference id, not the integer id)
-        project_id: str
-            Unique ``str`` identifying the BigQuery project that contains the dataset
         friendly_name : str, optional
             An optional descriptive name for the dataset
         description : str, optional
@@ -2010,6 +2016,8 @@ def dataset_resource(self, ref_id, friendly_name=None, description=None,
             Indicating access permissions
         location: str, optional, 'EU' or 'US'
             An optional geographical location for the dataset (EU or US)
+        project_id: str
+            Unique ``str`` identifying the BigQuery project that contains the dataset
 
         Returns
         -------
@@ -2017,6 +2025,7 @@ def dataset_resource(self, ref_id, friendly_name=None, description=None,
             Representing BigQuery dataset resource
         """
         project_id = self._get_project_id(project_id)
+
         data = {
             "datasetReference": {
                 "datasetId": ref_id,
diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py
index a5e8161..b581830 100644
--- a/bigquery/tests/test_client.py
+++ b/bigquery/tests/test_client.py
@@ -2904,18 +2904,18 @@ def test_dataset_update_success(self):
         self.mock_datasets.update.return_value.execute.side_effect = [{
             'status': 'foo'}, {'status': 'bar'}]
 
         actual = self.client.update_dataset(self.dataset,
-                                            self.friendly_name,
-                                            self.description,
-                                            self.access)
+                                            friendly_name=self.friendly_name,
+                                            description=self.description,
+                                            access=self.access)
 
         self.assertTrue(actual)
 
         self.client.swallow_results = False
 
         actual = self.client.update_dataset(self.dataset,
-                                            self.friendly_name,
-                                            self.description,
-                                            self.access)
+                                            friendly_name=self.friendly_name,
+                                            description=self.description,
+                                            access=self.access)
 
         self.assertEqual(actual, {'status': 'bar'})
 
From 1617ad653e3c9bc41706b4512b12b6fb50132ac9 Mon Sep 17 00:00:00 2001
From: Tyler Treat
Date: Fri, 16 Mar 2018 20:27:55 -0500
Subject: [PATCH 66/78] Bump version

---
 bigquery/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bigquery/version.py b/bigquery/version.py
index 84c54b7..e4f2ad4 100644
--- a/bigquery/version.py
+++ b/bigquery/version.py
@@ -1 +1 @@
-__version__ = '1.13.0'
+__version__ = '1.14.0'

From 700eb9dac5c6cf0bc4fd36078d083abcf828b0bd Mon Sep 17 00:00:00 2001
From: Juan Sandoval
Date: Tue, 24 Apr 2018 12:11:06 -0500
Subject: [PATCH 67/78] Remove OAuth cache
 discovery from google client library. (#1)

This fixes the warning:

ImportError: file_cache is unavailable when using oauth2client >= 4.0.0

and allows us to continue using the latest OAuth library versions.
---
 bigquery/client.py            |  9 +++++--
 bigquery/tests/test_client.py | 45 +++++++++++++++++++++++++++--------
 2 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/bigquery/client.py b/bigquery/client.py
index b9d4e51..d76cec7 100644
--- a/bigquery/client.py
+++ b/bigquery/client.py
@@ -175,8 +175,13 @@ def _get_bq_service(credentials=None, service_url=None):
     assert credentials, 'Must provide ServiceAccountCredentials'
 
     http = credentials.authorize(Http())
-    service = build('bigquery', 'v2', http=http,
-                    discoveryServiceUrl=service_url)
+    service = build(
+        'bigquery',
+        'v2',
+        http=http,
+        discoveryServiceUrl=service_url,
+        cache_discovery=False
+    )
 
     return service
 
diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py
index b581830..9af4f0c 100644
--- a/bigquery/tests/test_client.py
+++ b/bigquery/tests/test_client.py
@@ -67,8 +67,13 @@ def test_initialize_readonly(self, mock_build, mock_return_cred):
             scopes=BIGQUERY_SCOPE_READ_ONLY)
         self.assertTrue(
             mock_cred.from_p12_keyfile_buffer.return_value.authorize.called)
-        mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http,
-                                           discoveryServiceUrl=mock_service_url)
+        mock_build.assert_called_once_with(
+            'bigquery',
+            'v2',
+            http=mock_http,
+            discoveryServiceUrl=mock_service_url,
+            cache_discovery=False
+        )
 
         self.assertEquals(mock_bq, bq_client.bigquery)
         self.assertEquals(project_id, bq_client.project_id)
@@ -101,8 +106,13 @@ def test_initialize_read_write(self, mock_build, mock_return_cred):
             service_account, mock.ANY, scopes=BIGQUERY_SCOPE)
         self.assertTrue(
             mock_cred.from_p12_keyfile_buffer.return_value.authorize.called)
-        mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http,
-                                           discoveryServiceUrl=mock_service_url)
+        mock_build.assert_called_once_with(
+            'bigquery',
+            'v2',
+            http=mock_http,
+            discoveryServiceUrl=mock_service_url,
+            cache_discovery=False
+        )
 
         self.assertEquals(mock_bq, bq_client.bigquery)
         self.assertEquals(project_id, bq_client.project_id)
@@ -136,8 +146,13 @@ def test_initialize_key_file(self, mock_build, mock_return_cred):
             scopes=BIGQUERY_SCOPE)
         self.assertTrue(
             mock_cred.from_p12_keyfile.return_value.authorize.called)
-        mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http,
-                                           discoveryServiceUrl=mock_service_url)
+        mock_build.assert_called_once_with(
+            'bigquery',
+            'v2',
+            http=mock_http,
+            discoveryServiceUrl=mock_service_url,
+            cache_discovery=False
+        )
 
         self.assertEquals(mock_bq, bq_client.bigquery)
         self.assertEquals(project_id, bq_client.project_id)
@@ -172,8 +187,13 @@ def test_initialize_json_key_file(self, mock_open, mock_build, mock_return_cred)
             scopes=BIGQUERY_SCOPE)
         self.assertTrue(
             mock_cred.from_json_keyfile_dict.return_value.authorize.called)
-        mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http,
-                                           discoveryServiceUrl=mock_service_url)
+        mock_build.assert_called_once_with(
+            'bigquery',
+            'v2',
+            http=mock_http,
+            discoveryServiceUrl=mock_service_url,
+            cache_discovery=False
+        )
 
         self.assertEquals(mock_bq, bq_client.bigquery)
         self.assertEquals(project_id, bq_client.project_id)
@@ -208,8 +228,13 @@ def test_initialize_json_key_file_without_project_id(self, mock_open, mock_build
             scopes=BIGQUERY_SCOPE)
         self.assertTrue(
             mock_cred.from_json_keyfile_dict.return_value.authorize.called)
-        mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http,
-                                           discoveryServiceUrl=mock_service_url)
+        mock_build.assert_called_once_with(
+            'bigquery',
+            'v2',
+            http=mock_http,
+            discoveryServiceUrl=mock_service_url,
+            cache_discovery=False
+        )
 
         self.assertEquals(mock_bq, bq_client.bigquery)
         self.assertEquals(json_key['project_id'], bq_client.project_id)
 
From da151c3b16bbd0a8bc757efe7221f40c8b1c6e61 Mon Sep 17 00:00:00 2001
From: rhoboro
Date: Thu, 17 Jan 2019 12:19:09 +0900
Subject: [PATCH 68/78] fix get_all_tables with different project_id

---
 bigquery/client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bigquery/client.py b/bigquery/client.py
index d76cec7..6bfab16 100644
--- a/bigquery/client.py
+++ b/bigquery/client.py
@@ -1487,7 +1487,7 @@ def _get_all_tables_for_dataset(self, dataset_id, project_id=None):
         project_id = self._get_project_id(project_id)
 
         result = self.bigquery.tables().list(
-            projectId=self.project_id,
+            projectId=project_id,
             datasetId=dataset_id).execute(num_retries=self.num_retries)
 
         page_token = result.get('nextPageToken')

From fb47d0459b93646e859464a2a2313e7a5e58a059 Mon Sep 17 00:00:00 2001
From: rhoboro
Date: Thu, 17 Jan 2019 14:25:34 +0900
Subject: [PATCH 69/78] fix paging too

---
 bigquery/client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bigquery/client.py b/bigquery/client.py
index 6bfab16..537e23a 100644
--- a/bigquery/client.py
+++ b/bigquery/client.py
@@ -1493,7 +1493,7 @@ def _get_all_tables_for_dataset(self, dataset_id, project_id=None):
         page_token = result.get('nextPageToken')
         while page_token:
             res = self.bigquery.tables().list(
-                projectId=self.project_id,
+                projectId=project_id,
                 datasetId=dataset_id,
                 pageToken=page_token
             ).execute(num_retries=self.num_retries)

From 8df1c772e93f6335f6a1e8b1db1997a8592f0951 Mon Sep 17 00:00:00 2001
From: Tyler Treat
Date: Wed, 16 Jan 2019 23:34:29 -0600
Subject: [PATCH 70/78] Bump version to 1.14.1

---
 bigquery/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bigquery/version.py b/bigquery/version.py
index e4f2ad4..c162747 100644
--- a/bigquery/version.py
+++ b/bigquery/version.py
@@ -1 +1 @@
-__version__ = '1.14.0'
+__version__ = '1.14.1'

From 24cc8c18822e1478920b3144186e8672c5f4dc22 Mon Sep 17 00:00:00 2001
From: sleepless-se
Date: Thu, 14 Feb 2019 01:27:24 +0800
Subject: [PATCH 71/78] It was invalid JSON format. Add a comma

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8171078..009f125 100644
--- a/README.md
+++ b/README.md
@@ -173,7 +173,7 @@ The client provides an API for inserting data into a BigQuery table. The last pa
 ```python
 # Insert data into table.
 rows =  [
-    {'one': 'ein', 'two': 'zwei'}
+    {'one': 'ein', 'two': 'zwei'},
     {'id': 'NzAzYmRiY', 'one': 'uno', 'two': 'dos'},
     {'id': 'NzAzYmRiY', 'one': 'ein', 'two': 'zwei'} # duplicate entry
 ]

From 1491e1bdc0fb1b8a8fa8fd87255032c5834f10dc Mon Sep 17 00:00:00 2001
From: Yves Bastide
Date: Tue, 30 Jul 2019 13:18:36 +0200
Subject: [PATCH 72/78] Fix client.patch_table

tableId is a required argument of the patch method.
Also, there's no need to pass a tableReference in the body.
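For illustration, a minimal sketch of the corrected call shape (not part of the
patch itself; the client instance, the ids, and the schema below are placeholder
values):

```python
# Hedged sketch: assumes an authorized BigQueryClient `client` built via
# get_client(); the project/dataset/table ids and the schema are hypothetical.
schema = [{'name': 'foo', 'type': 'STRING', 'mode': 'NULLABLE'}]
body = {'schema': {'fields': schema}}  # no tableReference needed in the body

client.bigquery.tables().patch(
    projectId='my-project',
    datasetId='my_dataset',
    tableId='my_table',  # tableId is a required argument of patch()
    body=body,
).execute(num_retries=client.num_retries)
```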
Signed-off-by: Yves Bastide
---
 bigquery/client.py            | 6 +-----
 bigquery/tests/test_client.py | 9 ++++-----
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/bigquery/client.py b/bigquery/client.py
index 537e23a..125d048 100644
--- a/bigquery/client.py
+++ b/bigquery/client.py
@@ -735,17 +735,13 @@ def patch_table(self, dataset, table, schema, project_id=None):
 
         body = {
             'schema': {'fields': schema},
-            'tableReference': {
-                'tableId': table,
-                'projectId': project_id,
-                'datasetId': dataset
-            }
         }
 
         try:
             result = self.bigquery.tables().patch(
                 projectId=project_id,
                 datasetId=dataset,
+                tableId=table,
                 body=body
             ).execute(num_retries=self.num_retries)
             if self.swallow_results:
diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py
index 9af4f0c..5d36aa9 100644
--- a/bigquery/tests/test_client.py
+++ b/bigquery/tests/test_client.py
@@ -1913,9 +1913,6 @@ def setUp(self):
         self.client = client.BigQueryClient(self.mock_bq_service, self.project)
         self.body = {
             'schema': {'fields': self.schema},
-            'tableReference': {
-                'tableId': self.table, 'projectId': self.project,
-                'datasetId': self.dataset}
         }
         self.expiration_time = 1437513693000
 
@@ -1941,7 +1938,8 @@ def test_table_patch_failed(self):
         self.client.swallow_results = True
 
         self.mock_tables.patch.assert_called_with(
-            projectId=self.project, datasetId=self.dataset, body=self.body)
+            projectId=self.project, datasetId=self.dataset,
+            tableId=self.table, body=self.body)
         self.mock_tables.patch.return_value.execute. \
             assert_called_with(num_retries=0)
 
@@ -1968,7 +1966,8 @@ def test_table_patch_success(self):
         self.client.swallow_results = True
 
         self.mock_tables.patch.assert_called_with(
-            projectId=self.project, datasetId=self.dataset, body=self.body)
+            projectId=self.project, datasetId=self.dataset,
+            tableId=self.table, body=self.body)
         self.mock_tables.patch.return_value.execute. \
             assert_called_with(num_retries=0)
 
From 5cc95ba35913b68bcf19210534e41e708e7e8384 Mon Sep 17 00:00:00 2001
From: Yves Bastide
Date: Tue, 30 Jul 2019 14:22:51 +0200
Subject: [PATCH 73/78] Fix Travis and tox envlist

Replace python 3.3 and 3.4 with 3.5 and 3.6.

Signed-off-by: Yves Bastide
---
 .travis.yml | 3 ++-
 tox.ini     | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 1e1c28c..ba3cdc8 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,6 +10,7 @@ notifications:
   email: false
 env:
   - TOXENV=py27
-  - TOXENV=py34
+  - TOXENV=py35
+  - TOXENV=py36
   - TOXENV=nightly
  - TOXENV=pypy

diff --git a/tox.ini b/tox.ini
index ce76190..58dadc9 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,7 +4,7 @@
 # and then run "tox" from this directory.
 
 [tox]
-envlist = py27, py33, py34, nightly, pypy
+envlist = py27, py35, py36, nightly, pypy
 
 [testenv]
 commands = nosetests --logging-level=ERROR -a slow --with-coverage --cover-package=bigquery

From 8ebf84f6310b5bfd26de18b9dce50c7f37ff9b94 Mon Sep 17 00:00:00 2001
From: Ege U
Date: Tue, 10 Dec 2019 16:14:06 +0300
Subject: [PATCH 74/78] Dry runs now return bytes processed and cache hit

---
 bigquery/client.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/bigquery/client.py b/bigquery/client.py
index 125d048..eedafc2 100644
--- a/bigquery/client.py
+++ b/bigquery/client.py
@@ -243,7 +243,7 @@ def _submit_query_job(self, query_data):
         -------
         tuple
             job id and query results if query completed. If dry_run is True,
-            job id will be None and results will be empty if the query is valid
+            job id will be None and results will be ``[cacheHit, totalBytesProcessed]`` if the query is valid
             or a dict containing the response if invalid.
 
         Raises
@@ -269,13 +269,17 @@ def _submit_query_job(self, query_data):
         schema = query_reply.get('schema', {'fields': None})['fields']
         rows = query_reply.get('rows', [])
         job_complete = query_reply.get('jobComplete', False)
+        cache_hit = query_reply.get('cacheHit')
+        total_bytes_processed = query_reply.get('totalBytesProcessed')
 
         # raise exceptions if it's not an async query
         # and job is not completed after timeout
         if not job_complete and query_data.get("timeoutMs", False):
             logger.error('BigQuery job %s timeout' % job_id)
             raise BigQueryTimeoutException()
 
+        if query_data.get("dryRun", False):
+            return job_id, [cache_hit, total_bytes_processed]
         return job_id, [self._transform_row(row, schema) for row in rows]
 
     def _get_job_reference(self, job_id):
@@ -345,8 +349,8 @@ def query(self, query, max_results=None, timeout=0, dry_run=False, use_legacy_sq
             How long to wait for the query to complete, in seconds before
             the request times out and returns.
         dry_run : bool, optional
-            If True, the query isn't actually run. A valid query will return an
-            empty response, while an invalid one will return the same error
+            If True, the query isn't actually run. A valid query will return the
+            cache hit status and total bytes processed, while an invalid one will return the same error
             message it would if it wasn't a dry run.
         use_legacy_sql : bool, optional. Default True.
             If False, the query will use BigQuery's standard SQL
             (https://cloud.google.com/bigquery/sql-reference/)
@@ -359,7 +363,7 @@ def query(self, query, max_results=None, timeout=0, dry_run=False, use_legacy_sq
         -------
         tuple
             (job id, query results) if the query completed. If dry_run is True,
-            job id will be None and results will be empty if the query is valid
+            job id will be None and results will be ``[cacheHit, totalBytesProcessed]`` if the query is valid
             or a ``dict`` containing the response if invalid.
 
         Raises
 
From 01f38be5947df8ae5a9936703181a8062c5fc48c Mon Sep 17 00:00:00 2001
From: Ege U
Date: Tue, 10 Dec 2019 16:32:44 +0300
Subject: [PATCH 75/78] Rewrote the tests

---
 bigquery/tests/test_client.py | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py
index 5d36aa9..1f2d247 100644
--- a/bigquery/tests/test_client.py
+++ b/bigquery/tests/test_client.py
@@ -297,7 +297,9 @@ def test_query(self):
 
         mock_query_job.execute.return_value = {
             'jobReference': expected_job_ref,
-            'jobComplete': True
+            'jobComplete': True,
+            'cacheHit': False,
+            'totalBytesProcessed': 0
         }
 
         self.mock_job_collection.query.return_value = mock_query_job
@@ -329,6 +331,8 @@ def test_query_max_results_set(self):
         mock_query_job.execute.return_value = {
             'jobReference': expected_job_ref,
             'jobComplete': True,
+            'cacheHit': False,
+            'totalBytesProcessed': 0
         }
 
         self.mock_job_collection.query.return_value = mock_query_job
@@ -357,6 +361,8 @@ def test_query_timeout_set(self):
         mock_query_job.execute.return_value = {
             'jobReference': expected_job_ref,
             'jobComplete': True,
+            'cacheHit': False,
+            'totalBytesProcessed': 0
         }
 
         self.mock_job_collection.query.return_value = mock_query_job
@@ -382,6 +388,8 @@ def test_sync_query_timeout(self):
         mock_query_job.execute.return_value = {
             'jobReference': expected_job_ref,
             'jobComplete': False,
+            'cacheHit': False,
+            'totalBytesProcessed': 0
         }
 
         self.mock_job_collection.query.return_value = mock_query_job
@@ -400,6 +408,8 @@ def test_async_query_timeout(self):
         mock_query_job.execute.return_value = {
             'jobReference': expected_job_ref,
             'jobComplete': False,
+            'cacheHit': False,
+            'totalBytesProcessed': 0
         }
 
         self.mock_job_collection.query.return_value = mock_query_job
@@ -409,14 +419,18 @@ def test_async_query_timeout(self):
         self.assertEquals(results, [])
 
     def test_query_dry_run_valid(self):
-        """Ensure that None and an empty list is returned from the query when
+        """Ensure that None and [cacheHit, totalBytesProcessed] are returned from the query when
         dry_run is True and the query is valid.
 
""" mock_query_job = mock.Mock() - mock_query_job.execute.return_value = {'jobReference': {}, - 'jobComplete': True} + mock_query_job.execute.return_value = { + 'jobReference': {}, + 'jobComplete': True, + 'cacheHit': False, + 'totalBytesProcessed': 0 + } self.mock_job_collection.query.return_value = mock_query_job @@ -428,7 +442,7 @@ def test_query_dry_run_valid(self): 'dryRun': True} ) self.assertIsNone(job_id) - self.assertEqual([], results) + self.assertEqual([False, 0], results) def test_query_dry_run_invalid(self): """Ensure that None and a dict is returned from the query when dry_run @@ -468,6 +482,8 @@ def test_query_with_results(self): 'schema': {'fields': [{'name': 'foo', 'type': 'INTEGER'}]}, 'rows': [{'f': [{'v': 10}]}], 'jobComplete': True, + 'cacheHit': False, + 'totalBytesProcessed': 0 } self.mock_job_collection.query.return_value = mock_query_job @@ -491,7 +507,9 @@ def test_query_with_using_legacy_sql(self): mock_query_job.execute.return_value = { 'jobReference': expected_job_ref, - 'jobComplete': True + 'jobComplete': True, + 'cacheHit': False, + 'totalBytesProcessed': 0 } self.mock_job_collection.query.return_value = mock_query_job @@ -873,7 +891,7 @@ def test_json_job_body_constructed_correctly(self): body = { "jobReference": { "projectId": self.project_id, - "jobId": "job" + "jobId": "job", }, "configuration": { "load": { From 0d2c801745c48732f20c9002d2a6026995875540 Mon Sep 17 00:00:00 2001 From: Tyler Treat Date: Tue, 10 Dec 2019 17:48:32 -0600 Subject: [PATCH 76/78] Bump version to 1.15.0 --- bigquery/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigquery/version.py b/bigquery/version.py index c162747..1c19d78 100644 --- a/bigquery/version.py +++ b/bigquery/version.py @@ -1 +1 @@ -__version__ = '1.14.1' +__version__ = '1.15.0' From df42f83b637fbe4a70eac200ae05ea8a5f775316 Mon Sep 17 00:00:00 2001 From: Rahul Kumar Gupta <67097571+rahulshivan05@users.noreply.github.com> Date: Thu, 1 Oct 2020 09:06:03 +0530 Subject: [PATCH 77/78] Update requirements_dev.txt --- requirements_dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_dev.txt b/requirements_dev.txt index 74162c3..1040dea 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,6 +1,6 @@ nose rednose -mock==1.0.1 +mock==4.0.2 coverage nose-exclude tox From 77a7b1b1f3c3cbe50ce0db20b2ebc39012fbca78 Mon Sep 17 00:00:00 2001 From: Tim Gates Date: Wed, 24 Nov 2021 06:50:37 +1100 Subject: [PATCH 78/78] docs: fix simple typo, offical -> official There is a small typo in bigquery/client.py. Should read `official` rather than `offical`. --- bigquery/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigquery/client.py b/bigquery/client.py index eedafc2..bb4d50a 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -227,7 +227,7 @@ def _submit_query_job(self, query_data): """ Submit a query job to BigQuery. This is similar to BigQueryClient.query, but gives the user - direct access to the query method on the offical BigQuery + direct access to the query method on the official BigQuery python client. For fine-grained control over a query job, see: @@ -306,7 +306,7 @@ def _get_job_reference(self, job_id): def _insert_job(self, body_object): """ Submit a job to BigQuery - Direct proxy to the insert() method of the offical BigQuery + Direct proxy to the insert() method of the official BigQuery python client. Able to submit load, link, query, copy, or extract jobs.