diff --git a/.travis.yml b/.travis.yml index 1e1c28c..ba3cdc8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,6 +10,7 @@ notifications: email: false env: - TOXENV=py27 - - TOXENV=py34 + - TOXENV=py35 + - TOXENV=py36 - TOXENV=nightly - TOXENV=pypy diff --git a/README.md b/README.md index 8171078..009f125 100644 --- a/README.md +++ b/README.md @@ -173,7 +173,7 @@ The client provides an API for inserting data into a BigQuery table. The last pa ```python # Insert data into table. rows = [ - {'one': 'ein', 'two': 'zwei'} + {'one': 'ein', 'two': 'zwei'}, {'id': 'NzAzYmRiY', 'one': 'uno', 'two': 'dos'}, {'id': 'NzAzYmRiY', 'one': 'ein', 'two': 'zwei'} # duplicate entry ] diff --git a/bigquery/client.py b/bigquery/client.py index 847c9fb..bb4d50a 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -175,8 +175,13 @@ def _get_bq_service(credentials=None, service_url=None): assert credentials, 'Must provide ServiceAccountCredentials' http = credentials.authorize(Http()) - service = build('bigquery', 'v2', http=http, - discoveryServiceUrl=service_url) + service = build( + 'bigquery', + 'v2', + http=http, + discoveryServiceUrl=service_url, + cache_discovery=False + ) return service @@ -198,11 +203,31 @@ def __init__(self, bq_service, project_id, swallow_results=True, self.num_retries = num_retries self.cache = {} + def _get_project_id(self, project_id=None): + """ Get new project_id + + Default is self.project_id, which is the project client authenticate to. + A new project_id is specified when client wants to authenticate to 1 project, + but run jobs in a different project. + + Parameters + ---------- + project_id : str + BigQuery project_id + + Returns + ------- + project_id: BigQuery project_id + """ + if project_id is None: + project_id = self.project_id + return project_id + def _submit_query_job(self, query_data): """ Submit a query job to BigQuery. This is similar to BigQueryClient.query, but gives the user - direct access to the query method on the offical BigQuery + direct access to the query method on the official BigQuery python client. For fine-grained control over a query job, see: @@ -218,7 +243,7 @@ def _submit_query_job(self, query_data): ------- tuple job id and query results if query completed. If dry_run is True, - job id will be None and results will be empty if the query is valid + job id will be None and results will be [cacheHit and totalBytesProcessed] if the query is valid or a dict containing the response if invalid. Raises @@ -244,19 +269,44 @@ def _submit_query_job(self, query_data): schema = query_reply.get('schema', {'fields': None})['fields'] rows = query_reply.get('rows', []) job_complete = query_reply.get('jobComplete', False) + cache_hit = query_reply['cacheHit'] + total_bytes_processed = query_reply['totalBytesProcessed'] # raise exceptions if it's not an async query # and job is not completed after timeout if not job_complete and query_data.get("timeoutMs", False): logger.error('BigQuery job %s timeout' % job_id) raise BigQueryTimeoutException() - + + if query_data.get("dryRun", True): + return job_id, [cache_hit, total_bytes_processed] return job_id, [self._transform_row(row, schema) for row in rows] + def _get_job_reference(self, job_id): + """ Get job reference from job_id + For more details, see: + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#resource + + Parameters + ---------- + job_id: + Id of the job + + Returns + ------- + job_reference: json of job_reference + """ + job_reference = { + "projectId": self.project_id, + "jobId": job_id + } + + return job_reference + def _insert_job(self, body_object): """ Submit a job to BigQuery - Direct proxy to the insert() method of the offical BigQuery + Direct proxy to the insert() method of the official BigQuery python client. Able to submit load, link, query, copy, or extract jobs. @@ -299,8 +349,8 @@ def query(self, query, max_results=None, timeout=0, dry_run=False, use_legacy_sq How long to wait for the query to complete, in seconds before the request times out and returns. dry_run : bool, optional - If True, the query isn't actually run. A valid query will return an - empty response, while an invalid one will return the same error + If True, the query isn't actually run. A valid query will return + cache hit, and total bytes processed, while an invalid one will return the same error message it would if it wasn't a dry run. use_legacy_sql : bool, optional. Default True. If False, the query will use BigQuery's standard SQL (https://cloud.google.com/bigquery/sql-reference/) @@ -313,7 +363,7 @@ def query(self, query, max_results=None, timeout=0, dry_run=False, use_legacy_sq ------- tuple (job id, query results) if the query completed. If dry_run is True, - job id will be None and results will be empty if the query is valid + job id will be None and results will be [cacheHit and totalBytesProcessed] if the query is valid or a ``dict`` containing the response if invalid. Raises @@ -362,7 +412,7 @@ def get_query_schema(self, job_id): return query_reply['schema']['fields'] - def get_table_schema(self, dataset, table): + def get_table_schema(self, dataset, table, project_id=None): """Return the table schema. Parameters @@ -371,6 +421,8 @@ def get_table_schema(self, dataset, table): The dataset containing the `table`. table : str The table to get the schema for + project_id: str, optional + The project of the dataset. Returns ------- @@ -378,10 +430,11 @@ def get_table_schema(self, dataset, table): A ``list`` of ``dict`` objects that represent the table schema. If the table doesn't exist, None is returned. """ + project_id = self._get_project_id(project_id) - try: + try: result = self.bigquery.tables().get( - projectId=self.project_id, + projectId=project_id, tableId=table, datasetId=dataset).execute(num_retries=self.num_retries) except HttpError as e: @@ -458,45 +511,51 @@ def get_query_rows(self, job_id, offset=None, limit=None, timeout=0): records += [self._transform_row(row, schema) for row in rows] return records[:limit] if limit else records - def check_dataset(self, dataset_id): + def check_dataset(self, dataset_id, project_id=None): """Check to see if a dataset exists. Parameters ---------- dataset_id : str Dataset unique id + project_id: str, optional + The project the dataset is in Returns ------- bool True if dataset at `dataset_id` exists, else Fasle - """ - dataset = self.get_dataset(dataset_id) + """ + dataset = self.get_dataset(dataset_id, project_id) return bool(dataset) - def get_dataset(self, dataset_id): + def get_dataset(self, dataset_id, project_id=None): """Retrieve a dataset if it exists, otherwise return an empty dict. Parameters ---------- dataset_id : str Dataset unique id + project_id: str, optional + The project the dataset is in Returns ------- dict Contains dataset object if it exists, else empty """ - try: + project_id = self._get_project_id(project_id) + + try: dataset = self.bigquery.datasets().get( - projectId=self.project_id, datasetId=dataset_id).execute( + projectId=project_id, datasetId=dataset_id).execute( num_retries=self.num_retries) except HttpError: dataset = {} return dataset - def check_table(self, dataset, table): + def check_table(self, dataset, table, project_id=None): """Check to see if a table exists. Parameters @@ -505,16 +564,18 @@ def check_table(self, dataset, table): The dataset to check table : str The name of the table + project_id: str, optional + The project the table is in Returns ------- bool True if table exists, else False """ - table = self.get_table(dataset, table) + table = self.get_table(dataset, table, project_id) return bool(table) - def get_table(self, dataset, table): + def get_table(self, dataset, table, project_id=None): """ Retrieve a table if it exists, otherwise return an empty dict. Parameters @@ -523,15 +584,18 @@ def get_table(self, dataset, table): The dataset that the table is in table : str The name of the table + project_id: str, optional + The project that the table is in Returns ------- dict Containing the table object if it exists, else empty """ - try: + project_id = self._get_project_id(project_id) + try: table = self.bigquery.tables().get( - projectId=self.project_id, datasetId=dataset, + projectId=project_id, datasetId=dataset, tableId=table).execute(num_retries=self.num_retries) except HttpError: table = {} @@ -539,7 +603,8 @@ def get_table(self, dataset, table): return table def create_table(self, dataset, table, schema, - expiration_time=None, time_partitioning=False): + expiration_time=None, time_partitioning=False, + project_id=None): """Create a new table in the dataset. Parameters @@ -549,11 +614,13 @@ def create_table(self, dataset, table, schema, table : str The name of the table to create schema : dict - The table schema + The table schema expiration_time : int or double, optional The expiry time in milliseconds since the epoch. time_partitioning : bool, optional Create a time partitioning. + project_id: str, optional + The project to create the table in Returns ------- @@ -561,12 +628,13 @@ def create_table(self, dataset, table, schema, If the table was successfully created, or response from BigQuery if swallow_results is set to False """ + project_id = self._get_project_id(project_id) body = { 'schema': {'fields': schema}, 'tableReference': { 'tableId': table, - 'projectId': self.project_id, + 'projectId': project_id, 'datasetId': dataset } } @@ -579,7 +647,7 @@ def create_table(self, dataset, table, schema, try: table = self.bigquery.tables().insert( - projectId=self.project_id, + projectId=project_id, datasetId=dataset, body=body ).execute(num_retries=self.num_retries) @@ -589,14 +657,14 @@ def create_table(self, dataset, table, schema, return table except HttpError as e: - logger.error(('Cannot create table {0}.{1}\n' - 'Http Error: {2}').format(dataset, table, e.content)) + logger.error(('Cannot create table {0}.{1}.{2}\n' + 'Http Error: {3}').format(project_id, dataset, table, e.content)) if self.swallow_results: return False else: return {} - def update_table(self, dataset, table, schema): + def update_table(self, dataset, table, schema, project_id=None): """Update an existing table in the dataset. Parameters @@ -607,6 +675,8 @@ def update_table(self, dataset, table, schema): The name of the table to update schema : dict Table schema + project_id: str, optional + The project to update the table in Returns ------- @@ -614,19 +684,20 @@ def update_table(self, dataset, table, schema): bool indicating if the table was successfully updated or not, or response from BigQuery if swallow_results is set to False. """ + project_id = self._get_project_id(project_id) body = { 'schema': {'fields': schema}, 'tableReference': { 'tableId': table, - 'projectId': self.project_id, + 'projectId': project_id, 'datasetId': dataset } } try: result = self.bigquery.tables().update( - projectId=self.project_id, + projectId=project_id, tableId= table, datasetId=dataset, body=body @@ -637,14 +708,14 @@ def update_table(self, dataset, table, schema): return result except HttpError as e: - logger.error(('Cannot update table {0}.{1}\n' - 'Http Error: {2}').format(dataset, table, e.content)) + logger.error(('Cannot update table {0}.{1}.{2}\n' + 'Http Error: {3}').format(project_id, dataset, table, e.content)) if self.swallow_results: return False else: return {} - def patch_table(self, dataset, table, schema): + def patch_table(self, dataset, table, schema, project_id=None): """Patch an existing table in the dataset. Parameters @@ -655,6 +726,8 @@ def patch_table(self, dataset, table, schema): The name of the table to patch schema : dict The table schema + project_id: str, optional + The project to patch the table in Returns ------- @@ -662,20 +735,17 @@ def patch_table(self, dataset, table, schema): Bool indicating if the table was successfully patched or not, or response from BigQuery if swallow_results is set to False """ + project_id = self._get_project_id(project_id) body = { 'schema': {'fields': schema}, - 'tableReference': { - 'tableId': table, - 'projectId': self.project_id, - 'datasetId': dataset - } } try: result = self.bigquery.tables().patch( - projectId=self.project_id, + projectId=project_id, datasetId=dataset, + tableId=table, body=body ).execute(num_retries=self.num_retries) if self.swallow_results: @@ -684,14 +754,14 @@ def patch_table(self, dataset, table, schema): return result except HttpError as e: - logger.error(('Cannot patch table {0}.{1}\n' - 'Http Error: {2}').format(dataset, table, e.content)) + logger.error(('Cannot patch table {0}.{1}.{2}\n' + 'Http Error: {3}').format(project_id, dataset, table, e.content)) if self.swallow_results: return False else: return {} - def create_view(self, dataset, view, query, use_legacy_sql=None): + def create_view(self, dataset, view, query, use_legacy_sql=None, project_id=None): """Create a new view in the dataset. Parameters @@ -701,10 +771,12 @@ def create_view(self, dataset, view, query, use_legacy_sql=None): view : str The name of the view to create query : dict - A query that BigQuery executes when the view is referenced. + A query that BigQuery executes when the view is referenced. use_legacy_sql : bool, optional If False, the query will use BigQuery's standard SQL (https://cloud.google.com/bigquery/sql-reference/) + project_id: str, optional + The project to create the view in Returns ------- @@ -712,11 +784,12 @@ def create_view(self, dataset, view, query, use_legacy_sql=None): bool indicating if the view was successfully created or not, or response from BigQuery if swallow_results is set to False. """ + project_id = self._get_project_id(project_id) body = { 'tableReference': { 'tableId': view, - 'projectId': self.project_id, + 'projectId': project_id, 'datasetId': dataset }, 'view': { @@ -729,7 +802,7 @@ def create_view(self, dataset, view, query, use_legacy_sql=None): try: view = self.bigquery.tables().insert( - projectId=self.project_id, + projectId=project_id, datasetId=dataset, body=body ).execute(num_retries=self.num_retries) @@ -746,7 +819,7 @@ def create_view(self, dataset, view, query, use_legacy_sql=None): else: return {} - def delete_table(self, dataset, table): + def delete_table(self, dataset, table, project_id=None): """Delete a table from the dataset. Parameters @@ -755,6 +828,8 @@ def delete_table(self, dataset, table): The dataset to delete the table from. table : str The name of the table to delete + project_id: str, optional + String id of the project Returns ------- @@ -762,10 +837,11 @@ def delete_table(self, dataset, table): bool indicating if the table was successfully deleted or not, or response from BigQuery if swallow_results is set for False. """ + project_id = self._get_project_id(project_id) - try: + try: response = self.bigquery.tables().delete( - projectId=self.project_id, + projectId=project_id, datasetId=dataset, tableId=table ).execute(num_retries=self.num_retries) @@ -782,7 +858,7 @@ def delete_table(self, dataset, table): else: return {} - def get_tables(self, dataset_id, app_id, start_time, end_time): + def get_tables(self, dataset_id, app_id, start_time, end_time, project_id=None): """Retrieve a list of tables that are related to the given app id and are inside the range of start and end times. @@ -796,6 +872,8 @@ def get_tables(self, dataset_id, app_id, start_time, end_time): The datetime or unix time after which records will be fetched. end_time : Union[datetime, int] The datetime or unix time up to which records will be fetched. + project_id: str, optional + String id of the project Returns ------- @@ -809,7 +887,7 @@ def get_tables(self, dataset_id, app_id, start_time, end_time): if isinstance(end_time, datetime): end_time = calendar.timegm(end_time.utctimetuple()) - every_table = self._get_all_tables(dataset_id) + every_table = self._get_all_tables(dataset_id, project_id) app_tables = every_table.get(app_id, {}) return self._filter_tables_by_time(app_tables, start_time, end_time) @@ -819,7 +897,7 @@ def import_data_from_uris( source_uris, dataset, table, - schema=None, + schema=None, job=None, source_format=None, create_disposition=None, @@ -832,6 +910,7 @@ def import_data_from_uris( field_delimiter=None, quote=None, skip_leading_rows=None, + project_id=None, ): """ Imports data into a BigQuery table from cloud storage. Optional @@ -848,11 +927,11 @@ def import_data_from_uris( String id of the dataset table : str String id of the table + schema : list, optional + Represents the BigQuery schema job : str, optional Identifies the job (a unique job id is automatically generated if - not provided) - schema : list, optional - Represents the BigQuery schema + not provided) source_format : str, optional One of the JOB_SOURCE_FORMAT_* constants create_disposition : str, optional @@ -875,6 +954,8 @@ def import_data_from_uris( Quote character for csv only skip_leading_rows : int, optional For csv only + project_id: str, optional + String id of the project Returns ------- @@ -889,9 +970,11 @@ def import_data_from_uris( source_uris = source_uris if isinstance(source_uris, list) \ else [source_uris] + project_id = self._get_project_id(project_id) + configuration = { "destinationTable": { - "projectId": self.project_id, + "projectId": project_id, "tableId": table, "datasetId": dataset }, @@ -963,10 +1046,7 @@ def import_data_from_uris( "configuration": { 'load': configuration }, - "jobReference": { - "projectId": self.project_id, - "jobId": job - } + "jobReference": self._get_job_reference(job) } logger.debug("Creating load job %s" % body) @@ -978,12 +1058,13 @@ def export_data_to_uris( self, destination_uris, dataset, - table, + table, job=None, compression=None, destination_format=None, print_header=None, field_delimiter=None, + project_id=None, ): """ Export data from a BigQuery table to cloud storage. Optional arguments @@ -998,7 +1079,7 @@ def export_data_to_uris( dataset : str String id of the dataset table : str - String id of the table + String id of the table job : str, optional String identifying the job (a unique jobid is automatically generated if not provided) @@ -1010,6 +1091,8 @@ def export_data_to_uris( Whether or not to print the header field_delimiter : str, optional Character separating fields in delimited file + project_id: str, optional + String id of the project Returns ------- @@ -1024,9 +1107,11 @@ def export_data_to_uris( destination_uris = destination_uris \ if isinstance(destination_uris, list) else [destination_uris] + project_id = self._get_project_id(project_id) + configuration = { "sourceTable": { - "projectId": self.project_id, + "projectId": project_id, "tableId": table, "datasetId": dataset }, @@ -1057,10 +1142,7 @@ def export_data_to_uris( "configuration": { 'extract': configuration }, - "jobReference": { - "projectId": self.project_id, - "jobId": job - } + "jobReference": self._get_job_reference(job) } logger.info("Creating export job %s" % body) @@ -1072,7 +1154,7 @@ def write_to_table( self, query, dataset=None, - table=None, + table=None, external_udf_uris=None, allow_large_results=None, use_query_cache=None, @@ -1081,7 +1163,8 @@ def write_to_table( write_disposition=None, use_legacy_sql=None, maximum_billing_tier=None, - flatten=None + flatten=None, + project_id=None, ): """ Write query result to table. If dataset or table is not provided, @@ -1096,7 +1179,7 @@ def write_to_table( dataset : str, optional String id of the dataset table : str, optional - String id of the table + String id of the table external_udf_uris : list, optional Contains external UDF URIs. If given, URIs must be Google Cloud Storage and have .js extensions. @@ -1122,6 +1205,8 @@ def write_to_table( flatten : bool, optional Whether or not to flatten nested and repeated fields in query results + project_id: str, optional + String id of the project Returns ------- @@ -1138,9 +1223,11 @@ def write_to_table( "query": query, } + project_id = self._get_project_id(project_id) + if dataset and table: configuration['destinationTable'] = { - "projectId": self.project_id, + "projectId": project_id, "tableId": table, "datasetId": dataset } @@ -1235,7 +1322,7 @@ def wait_for_job(self, job, interval=5, timeout=60): def push_rows(self, dataset, table, rows, insert_id_key=None, skip_invalid_rows=None, ignore_unknown_values=None, - template_suffix=None): + template_suffix=None, project_id=None): """Upload rows to BigQuery table. Parameters @@ -1243,7 +1330,7 @@ def push_rows(self, dataset, table, rows, insert_id_key=None, dataset : str The dataset to upload to table : str - The name of the table to insert rows into + The name of the table to insert rows into rows : list A ``list`` of rows (``dict`` objects) to add to the table insert_id_key : str, optional @@ -1256,6 +1343,8 @@ def push_rows(self, dataset, table, rows, insert_id_key=None, template_suffix : str, optional Inserts the rows into an {table}{template_suffix}. If table {table}{template_suffix} doesn't exist, create from {table}. + project_id: str, optional + The project to upload to Returns ------- @@ -1263,7 +1352,7 @@ def push_rows(self, dataset, table, rows, insert_id_key=None, bool indicating if insert succeeded or not, or response from BigQuery if swallow_results is set for False. """ - + project_id = self._get_project_id(project_id) table_data = self.bigquery.tabledata() rows_data = [] @@ -1291,9 +1380,9 @@ def push_rows(self, dataset, table, rows, insert_id_key=None, if template_suffix is not None: data['templateSuffix'] = template_suffix - try: + try: response = table_data.insertAll( - projectId=self.project_id, + projectId=project_id, datasetId=dataset, tableId=table, body=data @@ -1325,19 +1414,21 @@ def push_rows(self, dataset, table, rows, insert_id_key=None, }] } - def get_all_tables(self, dataset_id): + def get_all_tables(self, dataset_id, project_id=None): """Retrieve a list of tables for the dataset. Parameters ---------- dataset_id : str The dataset to retrieve table data for. + project_id: str + Unique ``str`` identifying the BigQuery project contains the dataset Returns ------- A ``list`` with all table names """ - tables_data = self._get_all_tables_for_dataset(dataset_id) + tables_data = self._get_all_tables_for_dataset(dataset_id, project_id) tables = [] for table in tables_data.get('tables', []): @@ -1346,7 +1437,7 @@ def get_all_tables(self, dataset_id): tables.append(table_name) return tables - def _get_all_tables(self, dataset_id, cache=False): + def _get_all_tables(self, dataset_id, cache=False, project_id=None): """Retrieve the list of tables for dataset, that respect the formats: * appid_YYYY_MM * YYYY_MM_appid @@ -1354,10 +1445,12 @@ def _get_all_tables(self, dataset_id, cache=False): Parameters ---------- dataset_id : str - The dataset to retrieve table names for + The dataset to retrieve table names for cache : bool, optional To use cached value or not (default False). Timeout value equals CACHE_TIMEOUT. + project_id: str + Unique ``str`` identifying the BigQuery project contains the dataset Returns ------- @@ -1371,32 +1464,36 @@ def _get_all_tables(self, dataset_id, cache=False): do_fetch = False if do_fetch: - result = self._get_all_tables_for_dataset(dataset_id) + result = self._get_all_tables_for_dataset(dataset_id, project_id) self.cache[dataset_id] = (datetime.now(), result) return self._parse_table_list_response(result) - def _get_all_tables_for_dataset(self, dataset_id): + def _get_all_tables_for_dataset(self, dataset_id, project_id=None): """Retrieve a list of all tables for the dataset. Parameters ---------- dataset_id : str The dataset to retrieve table names for + project_id: str + Unique ``str`` identifying the BigQuery project contains the dataset Returns ------- dict A ``dict`` containing tables key with all tables """ + project_id = self._get_project_id(project_id) + result = self.bigquery.tables().list( - projectId=self.project_id, + projectId=project_id, datasetId=dataset_id).execute(num_retries=self.num_retries) page_token = result.get('nextPageToken') while page_token: res = self.bigquery.tables().list( - projectId=self.project_id, + projectId=project_id, datasetId=dataset_id, pageToken=page_token ).execute(num_retries=self.num_retries) @@ -1683,14 +1780,14 @@ def _raise_executing_exception_if_error(self, job): # DataSet manipulation methods # def create_dataset(self, dataset_id, friendly_name=None, description=None, - access=None, location=None): + access=None, location=None, project_id=None): """Create a new BigQuery dataset. Parameters ---------- dataset_id : str Unique ``str`` identifying the dataset with the project (the - referenceID of the dataset, not the integer id of the dataset) + referenceID of the dataset, not the integer id of the dataset) friendly_name: str, optional A human readable name description: str, optional @@ -1701,6 +1798,8 @@ def create_dataset(self, dataset_id, friendly_name=None, description=None, location : str, optional Indicating where dataset should be stored: EU or US (see https://developers.google.com/bigquery/docs/reference/v2/datasets#resource) + project_id: str + Unique ``str`` identifying the BigQuery project contains the dataset Returns ------- @@ -1708,15 +1807,19 @@ def create_dataset(self, dataset_id, friendly_name=None, description=None, ``bool`` indicating if dataset was created or not, or response from BigQuery if swallow_results is set for False """ - try: + project_id = self._get_project_id(project_id) + + try: datasets = self.bigquery.datasets() - dataset_data = self.dataset_resource(dataset_id, + dataset_data = self.dataset_resource(dataset_id, + project_id=project_id, friendly_name=friendly_name, description=description, access=access, - location=location) + location=location + ) - response = datasets.insert(projectId=self.project_id, + response = datasets.insert(projectId=project_id, body=dataset_data).execute( num_retries=self.num_retries) if self.swallow_results: @@ -1731,34 +1834,43 @@ def create_dataset(self, dataset_id, friendly_name=None, description=None, else: return {} - def get_datasets(self): + def get_datasets(self, project_id=None): """List all datasets in the project. + + Parameters + ---------- + project_id: str + Unique ``str`` identifying the BigQuery project contains the dataset Returns ------- list Dataset resources """ - try: + project_id = self._get_project_id(project_id) + + try: datasets = self.bigquery.datasets() - request = datasets.list(projectId=self.project_id) + request = datasets.list(projectId=project_id) result = request.execute(num_retries=self.num_retries) return result.get('datasets', []) except HttpError as e: logger.error("Cannot list datasets: {0}".format(e)) return None - def delete_dataset(self, dataset_id, delete_contents=False): + def delete_dataset(self, dataset_id, delete_contents=False, project_id=None): """Delete a BigQuery dataset. Parameters ---------- dataset_id : str - Unique ``str`` identifying the datset with the project (the - referenceId of the dataset) + Unique ``str`` identifying the dataset with the project (the + referenceId of the dataset) + Unique ``str`` identifying the BigQuery project contains the dataset delete_contents : bool, optional If True, forces the deletion of the dataset even when the dataset contains data (Default = False) + project_id: str, optional Returns ------- @@ -1771,9 +1883,11 @@ def delete_dataset(self, dataset_id, delete_contents=False): HttpError 404 when dataset with dataset_id does not exist """ - try: + project_id = self._get_project_id(project_id) + + try: datasets = self.bigquery.datasets() - request = datasets.delete(projectId=self.project_id, + request = datasets.delete(projectId=project_id, datasetId=dataset_id, deleteContents=delete_contents) response = request.execute(num_retries=self.num_retries) @@ -1790,7 +1904,7 @@ def delete_dataset(self, dataset_id, delete_contents=False): return {} def update_dataset(self, dataset_id, friendly_name=None, description=None, - access=None): + access=None, project_id=None): """Updates information in an existing dataset. The update method replaces the entire dataset resource, whereas the patch method only replaces fields that are provided in the submitted dataset resource. @@ -1799,13 +1913,15 @@ def update_dataset(self, dataset_id, friendly_name=None, description=None, ---------- dataset_id : str Unique ``str`` identifying the dataset with the project (the - referencedId of the dataset) + referencedId of the dataset) friendly_name : str, optional An optional descriptive name for the dataset. description : str, optional An optional description of the dataset. access : list, optional Indicating access permissions + project_id: str, optional + Unique ``str`` identifying the BigQuery project contains the dataset Returns ------- @@ -1813,11 +1929,17 @@ def update_dataset(self, dataset_id, friendly_name=None, description=None, ``bool`` indicating if the update was successful or not, or response from BigQuery if swallow_results is set for False. """ - try: + project_id = self._get_project_id(project_id) + + try: datasets = self.bigquery.datasets() - body = self.dataset_resource(dataset_id, friendly_name, - description, access) - request = datasets.update(projectId=self.project_id, + body = self.dataset_resource(dataset_id, + friendly_name=friendly_name, + description=description, + access=access, + project_id=project_id) + + request = datasets.update(projectId=project_id, datasetId=dataset_id, body=body) response = request.execute(num_retries=self.num_retries) @@ -1834,7 +1956,7 @@ def update_dataset(self, dataset_id, friendly_name=None, description=None, return {} def patch_dataset(self, dataset_id, friendly_name=None, description=None, - access=None): + access=None, project_id=None): """Updates information in an existing dataset. The update method replaces the entire dataset resource, whereas the patch method only replaces fields that are provided in the submitted dataset resource. @@ -1843,13 +1965,15 @@ def patch_dataset(self, dataset_id, friendly_name=None, description=None, ---------- dataset_id : str Unique string idenfitying the dataset with the project (the - referenceId of the dataset) + referenceId of the dataset) friendly_name : str, optional An optional descriptive name for the dataset. description : str, optional An optional description of the dataset. access : list, optional Indicating access permissions. + project_id: str, optional + Unique ``str`` identifying the BigQuery project contains the dataset Returns ------- @@ -1857,11 +1981,16 @@ def patch_dataset(self, dataset_id, friendly_name=None, description=None, ``bool`` indicating if the patch was successful or not, or response from BigQuery if swallow_results is set for False. """ - try: + project_id = self._get_project_id(project_id) + + try: datasets = self.bigquery.datasets() - body = self.dataset_resource(dataset_id, friendly_name, - description, access) - request = datasets.patch(projectId=self.project_id, + body = self.dataset_resource(dataset_id, + friendly_name=friendly_name, + description=description, + access=access, + project_id=project_id) + request = datasets.patch(projectId=project_id, datasetId=dataset_id, body=body) response = request.execute(num_retries=self.num_retries) if self.swallow_results: @@ -1876,14 +2005,14 @@ def patch_dataset(self, dataset_id, friendly_name=None, description=None, return {} def dataset_resource(self, ref_id, friendly_name=None, description=None, - access=None, location=None): + access=None, location=None, project_id=None): """See https://developers.google.com/bigquery/docs/reference/v2/datasets#resource Parameters ---------- ref_id : str - Dataset id (the reference id, not the integer id) + Dataset id (the reference id, not the integer id) friendly_name : str, optional An optional descriptive name for the dataset description : str, optional @@ -1892,16 +2021,20 @@ def dataset_resource(self, ref_id, friendly_name=None, description=None, Indicating access permissions location: str, optional, 'EU' or 'US' An optional geographical location for the dataset(EU or US) + project_id: str + Unique ``str`` identifying the BigQuery project contains the dataset Returns ------- dict Representing BigQuery dataset resource """ + project_id = self._get_project_id(project_id) + data = { "datasetReference": { "datasetId": ref_id, - "projectId": self.project_id + "projectId": project_id } } if friendly_name: diff --git a/bigquery/query_builder.py b/bigquery/query_builder.py index b29d0cd..435bb73 100644 --- a/bigquery/query_builder.py +++ b/bigquery/query_builder.py @@ -241,6 +241,8 @@ def _render_condition(field, field_type, comparators): else: value = _render_condition_value(value, field_type) value = "(" + value + ")" + elif condition == "IS NULL" or condition == "IS NOT NULL": + return field + " " + condition elif condition == "BETWEEN": if isinstance(value, (tuple, list, set)) and len(value) == 2: value = ' AND '.join( diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py index a5e8161..1f2d247 100644 --- a/bigquery/tests/test_client.py +++ b/bigquery/tests/test_client.py @@ -67,8 +67,13 @@ def test_initialize_readonly(self, mock_build, mock_return_cred): scopes=BIGQUERY_SCOPE_READ_ONLY) self.assertTrue( mock_cred.from_p12_keyfile_buffer.return_value.authorize.called) - mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, - discoveryServiceUrl=mock_service_url) + mock_build.assert_called_once_with( + 'bigquery', + 'v2', + http=mock_http, + discoveryServiceUrl=mock_service_url, + cache_discovery=False + ) self.assertEquals(mock_bq, bq_client.bigquery) self.assertEquals(project_id, bq_client.project_id) @@ -101,8 +106,13 @@ def test_initialize_read_write(self, mock_build, mock_return_cred): service_account, mock.ANY, scopes=BIGQUERY_SCOPE) self.assertTrue( mock_cred.from_p12_keyfile_buffer.return_value.authorize.called) - mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, - discoveryServiceUrl=mock_service_url) + mock_build.assert_called_once_with( + 'bigquery', + 'v2', + http=mock_http, + discoveryServiceUrl=mock_service_url, + cache_discovery=False + ) self.assertEquals(mock_bq, bq_client.bigquery) self.assertEquals(project_id, bq_client.project_id) @@ -136,8 +146,13 @@ def test_initialize_key_file(self, mock_build, mock_return_cred): scopes=BIGQUERY_SCOPE) self.assertTrue( mock_cred.from_p12_keyfile.return_value.authorize.called) - mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, - discoveryServiceUrl=mock_service_url) + mock_build.assert_called_once_with( + 'bigquery', + 'v2', + http=mock_http, + discoveryServiceUrl=mock_service_url, + cache_discovery=False + ) self.assertEquals(mock_bq, bq_client.bigquery) self.assertEquals(project_id, bq_client.project_id) @@ -172,8 +187,13 @@ def test_initialize_json_key_file(self, mock_open, mock_build, mock_return_cred) scopes=BIGQUERY_SCOPE) self.assertTrue( mock_cred.from_json_keyfile_dict.return_value.authorize.called) - mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, - discoveryServiceUrl=mock_service_url) + mock_build.assert_called_once_with( + 'bigquery', + 'v2', + http=mock_http, + discoveryServiceUrl=mock_service_url, + cache_discovery=False + ) self.assertEquals(mock_bq, bq_client.bigquery) self.assertEquals(project_id, bq_client.project_id) @@ -208,8 +228,13 @@ def test_initialize_json_key_file_without_project_id(self, mock_open, mock_build scopes=BIGQUERY_SCOPE) self.assertTrue( mock_cred.from_json_keyfile_dict.return_value.authorize.called) - mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, - discoveryServiceUrl=mock_service_url) + mock_build.assert_called_once_with( + 'bigquery', + 'v2', + http=mock_http, + discoveryServiceUrl=mock_service_url, + cache_discovery=False + ) self.assertEquals(mock_bq, bq_client.bigquery) self.assertEquals(json_key['project_id'], bq_client.project_id) @@ -272,7 +297,9 @@ def test_query(self): mock_query_job.execute.return_value = { 'jobReference': expected_job_ref, - 'jobComplete': True + 'jobComplete': True, + 'cacheHit': False, + 'totalBytesProcessed': 0 } self.mock_job_collection.query.return_value = mock_query_job @@ -304,6 +331,8 @@ def test_query_max_results_set(self): mock_query_job.execute.return_value = { 'jobReference': expected_job_ref, 'jobComplete': True, + 'cacheHit': False, + 'totalBytesProcessed': 0 } self.mock_job_collection.query.return_value = mock_query_job @@ -332,6 +361,8 @@ def test_query_timeout_set(self): mock_query_job.execute.return_value = { 'jobReference': expected_job_ref, 'jobComplete': True, + 'cacheHit': False, + 'totalBytesProcessed': 0 } self.mock_job_collection.query.return_value = mock_query_job @@ -357,6 +388,8 @@ def test_sync_query_timeout(self): mock_query_job.execute.return_value = { 'jobReference': expected_job_ref, 'jobComplete': False, + 'cacheHit': False, + 'totalBytesProcessed': 0 } self.mock_job_collection.query.return_value = mock_query_job @@ -375,6 +408,8 @@ def test_async_query_timeout(self): mock_query_job.execute.return_value = { 'jobReference': expected_job_ref, 'jobComplete': False, + 'cacheHit': False, + 'totalBytesProcessed': 0 } self.mock_job_collection.query.return_value = mock_query_job @@ -384,14 +419,18 @@ def test_async_query_timeout(self): self.assertEquals(results, []) def test_query_dry_run_valid(self): - """Ensure that None and an empty list is returned from the query when + """Ensure that None and [cacheHit, totalBytesProcessed] is returned from the query when dry_run is True and the query is valid. """ mock_query_job = mock.Mock() - mock_query_job.execute.return_value = {'jobReference': {}, - 'jobComplete': True} + mock_query_job.execute.return_value = { + 'jobReference': {}, + 'jobComplete': True, + 'cacheHit': False, + 'totalBytesProcessed': 0 + } self.mock_job_collection.query.return_value = mock_query_job @@ -403,7 +442,7 @@ def test_query_dry_run_valid(self): 'dryRun': True} ) self.assertIsNone(job_id) - self.assertEqual([], results) + self.assertEqual([False, 0], results) def test_query_dry_run_invalid(self): """Ensure that None and a dict is returned from the query when dry_run @@ -443,6 +482,8 @@ def test_query_with_results(self): 'schema': {'fields': [{'name': 'foo', 'type': 'INTEGER'}]}, 'rows': [{'f': [{'v': 10}]}], 'jobComplete': True, + 'cacheHit': False, + 'totalBytesProcessed': 0 } self.mock_job_collection.query.return_value = mock_query_job @@ -466,7 +507,9 @@ def test_query_with_using_legacy_sql(self): mock_query_job.execute.return_value = { 'jobReference': expected_job_ref, - 'jobComplete': True + 'jobComplete': True, + 'cacheHit': False, + 'totalBytesProcessed': 0 } self.mock_job_collection.query.return_value = mock_query_job @@ -848,7 +891,7 @@ def test_json_job_body_constructed_correctly(self): body = { "jobReference": { "projectId": self.project_id, - "jobId": "job" + "jobId": "job", }, "configuration": { "load": { @@ -1888,9 +1931,6 @@ def setUp(self): self.client = client.BigQueryClient(self.mock_bq_service, self.project) self.body = { 'schema': {'fields': self.schema}, - 'tableReference': { - 'tableId': self.table, 'projectId': self.project, - 'datasetId': self.dataset} } self.expiration_time = 1437513693000 @@ -1916,7 +1956,8 @@ def test_table_patch_failed(self): self.client.swallow_results = True self.mock_tables.patch.assert_called_with( - projectId=self.project, datasetId=self.dataset, body=self.body) + projectId=self.project, datasetId=self.dataset, + tableId=self.table, body=self.body) self.mock_tables.patch.return_value.execute. \ assert_called_with(num_retries=0) @@ -1943,7 +1984,8 @@ def test_table_patch_success(self): self.client.swallow_results = True self.mock_tables.patch.assert_called_with( - projectId=self.project, datasetId=self.dataset, body=self.body) + projectId=self.project, datasetId=self.dataset, + tableId=self.table, body=self.body) self.mock_tables.patch.return_value.execute. \ assert_called_with(num_retries=0) @@ -2904,18 +2946,18 @@ def test_dataset_update_success(self): self.mock_datasets.update.return_value.execute.side_effect = [{ 'status': 'foo'}, {'status': 'bar'}] - actual = self.client.update_dataset(self.dataset, - self.friendly_name, - self.description, - self.access) + actual = self.client.update_dataset(self.dataset, + friendly_name=self.friendly_name, + description=self.description, + access=self.access) self.assertTrue(actual) self.client.swallow_results = False - actual = self.client.update_dataset(self.dataset, - self.friendly_name, - self.description, - self.access) + actual = self.client.update_dataset(self.dataset, + friendly_name=self.friendly_name, + description=self.description, + access=self.access) self.assertEqual(actual, {'status': 'bar'}) diff --git a/bigquery/version.py b/bigquery/version.py index 84c54b7..1c19d78 100644 --- a/bigquery/version.py +++ b/bigquery/version.py @@ -1 +1 @@ -__version__ = '1.13.0' +__version__ = '1.15.0' diff --git a/requirements_dev.txt b/requirements_dev.txt index 74162c3..1040dea 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,6 +1,6 @@ nose rednose -mock==1.0.1 +mock==4.0.2 coverage nose-exclude tox diff --git a/tox.ini b/tox.ini index ce76190..58dadc9 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py27, py33, py34, nightly, pypy +envlist = py27, py35, py36, nightly, pypy [testenv] commands = nosetests --logging-level=ERROR -a slow --with-coverage --cover-package=bigquery