forked from tylertreat/BigQuery-Python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathschema_builder.py
More file actions
145 lines (119 loc) · 3.99 KB
/
schema_builder.py
File metadata and controls
145 lines (119 loc) · 3.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
from __future__ import absolute_import
__author__ = 'Aneil Mallavarapu (http://github.com/aneilbaboo)'
from datetime import datetime
import six
import dateutil.parser
from .errors import InvalidTypeException
def default_timestamp_parser(s):
    """Report whether ``s`` can be parsed as a date/time by ``dateutil``.

    Parameters
    ----------
    s : str
        Candidate value to test

    Returns
    -------
    bool
        True if ``dateutil.parser.parse(s)`` succeeds and returns a truthy
        value; False if the result is falsy or parsing raises an exception
    """
    try:
        return bool(dateutil.parser.parse(s))
    except Exception:
        # Any parsing failure means "not a timestamp".  Catching
        # ``Exception`` instead of using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate rather than being
        # silently reported as "not a date".
        return False
def schema_from_record(record, timestamp_parser=default_timestamp_parser):
    """Generate a BigQuery schema given an example of a record that is to be
    inserted into BigQuery.

    Parameters
    ----------
    record : dict
        Example of a record that is to be inserted into BigQuery
    timestamp_parser : function, optional
        Unary function taking a ``str`` and returning a ``bool`` that is
        True if the string represents a date

    Returns
    -------
    list
        One schema field description (as produced by ``describe_field``) per
        key/value pair of `record`
    """
    # Iterate the items view directly; wrapping it in ``list()`` only built
    # a throwaway copy.
    return [describe_field(name, value, timestamp_parser=timestamp_parser)
            for name, value in record.items()]
def describe_field(k, v, timestamp_parser=default_timestamp_parser):
    """Build the BigQuery schema element for a single column.

    The column name is `k` and the element is derived from the example
    value `v`. A list value yields a "repeated" field whose type is taken
    from the first element of the list; any other value yields a "nullable"
    field. Values whose type is "record" additionally get a nested
    ``fields`` entry built by ``schema_from_record``.

    Parameters
    ----------
    k : Union[str, unicode]
        Key representing the column
    v : Union[str, unicode, int, float, datetime, object]
        Value mapped to by `k`
    timestamp_parser : function, optional
        Unary function taking a ``str`` and returning a ``bool`` that is
        True if the string represents a date; forwarded to
        ``bigquery_type`` and ``schema_from_record``

    Returns
    -------
    dict
        Describing the field, with keys ``name``, ``type``, ``mode`` and,
        for records, ``fields``

    Raises
    ------
    Exception
        If `v` is an empty list, since there is no element to inspect.
    InvalidTypeException
        If no BigQuery type could be determined for the value. For nested
        records the reported key is the dotted path to the offending value.

    Examples
    --------
    >>> describe_field("username", "Bob")
    {"name": "username", "type": "string", "mode": "nullable"}
    >>> describe_field("users", [{"username": "Bob"}])
    {"name": "users", "type": "record", "mode": "repeated",
     "fields": [{"name":"username","type":"string","mode":"nullable"}]}
    """
    repeated = isinstance(v, list)
    if repeated and not v:
        raise Exception(
            "Can't describe schema because of empty list {0}:[]".format(k))

    # For lists, only the first element is used to infer the type.
    sample = v[0] if repeated else v

    bq_type = bigquery_type(sample, timestamp_parser=timestamp_parser)
    if not bq_type:
        raise InvalidTypeException(k, sample)

    field = {
        "name": k,
        "type": bq_type,
        "mode": "repeated" if repeated else "nullable",
    }
    if bq_type != "record":
        return field

    try:
        field['fields'] = schema_from_record(sample, timestamp_parser)
    except InvalidTypeException as e:
        # recursively construct the key causing the error
        raise InvalidTypeException("%s.%s" % (k, e.key), e.value)
    return field
def bigquery_type(o, timestamp_parser=default_timestamp_parser):
    """Given a value, return the matching BigQuery type of that value. Must be
    one of str/unicode/int/float/bool/datetime/record, where record is a dict
    containing values which have matching BigQuery types.

    Subclasses of the supported Python types are accepted as well. Byte
    strings are only treated as strings on Python 2.

    Parameters
    ----------
    o : object
        A Python object
    timestamp_parser : function, optional
        Unary function taking a ``str`` and returning a ``bool`` that is
        True if the string represents a date. Pass a falsy value to always
        classify strings as "string".

    Returns
    -------
    Union[str, None]
        Name of the corresponding BigQuery type for `o`, or None if no type
        could be found

    Examples
    --------
    >>> bigquery_type("abc")
    "string"
    >>> bigquery_type(123)
    "integer"
    """
    # bool is a subclass of int, so it has to be checked before the integer
    # types or True/False would be reported as "integer".
    if isinstance(o, bool):
        return "boolean"
    if isinstance(o, six.integer_types):
        return "integer"
    if isinstance(o, six.text_type) or (
            six.PY2 and isinstance(o, six.binary_type)):
        if timestamp_parser and timestamp_parser(o):
            return "timestamp"
        return "string"
    if isinstance(o, float):
        return "float"
    if isinstance(o, dict):
        return "record"
    if isinstance(o, datetime):
        return "timestamp"
    return None  # failed to find a type