BigQuery Expert Engineer Skill This skill provides a comprehensive guide for BigQuery development. 1. bq Command Line Tool Basics 1.1. Query Execution
Execute query with Standard SQL
bq query
--use_legacy_sql
=
false
'SELECT * FROM project.dataset.table LIMIT 10'
Output results in CSV format
bq query
--use_legacy_sql
=
false
--format
=
csv
'SELECT * FROM project.dataset.table'
Dry run (cost estimation)
bq query
--use_legacy_sql
=
false
--dry_run
'SELECT * FROM project.dataset.table'
Save results to table
bq query
--use_legacy_sql
=
false
--destination_table
=
project:dataset.result_table
'SELECT * FROM project.dataset.table'
1.2. Table Operations
List tables
bq ls project:dataset
Check table schema
bq show --schema --format = prettyjson project:dataset.table
Create table (from schema file)
bq mk --table project:dataset.table schema.json
Create partitioned table
bq mk --table --time_partitioning_field = created_at project:dataset.table schema.json
Create clustered table
bq mk --table --clustering_fields = user_id,category project:dataset.table schema.json
Delete table
bq rm -t project:dataset.table 1.3. Data Load/Export
Load from CSV
bq load --source_format = CSV project:dataset.table gs://bucket/data.csv schema.json
Load from JSON
bq load --source_format = NEWLINE_DELIMITED_JSON project:dataset.table gs://bucket/data.json
Load from Parquet (auto-detect schema)
bq load --source_format = PARQUET --autodetect project:dataset.table gs://bucket/data.parquet
Export to Cloud Storage
- bq extract
- --destination_format
- =
- CSV project:dataset.table gs://bucket/export/*.csv
- 2. GoogleSQL Basic Syntax
- 2.1. SELECT Statement
- -- Basic SELECT
- SELECT
- column1
- ,
- column2
- ,
- COUNT
- (
- *
- )
- AS
- count
- FROM
- `
- project.dataset.table
- `
- WHERE
- date
- >=
- '2024-01-01'
- GROUP
- BY
- column1
- ,
- column2
- HAVING
- COUNT
- (
- *
- )
- >
- 10
- ORDER
- BY
- count
- DESC
- LIMIT
- 100
- 2.2. Common Functions
- -- String functions
- CONCAT
- (
- str1
- ,
- str2
- )
- LOWER
- (
- str
- )
- ,
- UPPER
- (
- str
- )
- TRIM
- (
- str
- )
- ,
- LTRIM
- (
- str
- )
- ,
- RTRIM
- (
- str
- )
- SUBSTR
- (
- str
- ,
- start
- ,
- length
- )
- REGEXP_CONTAINS
- (
- str
- ,
- r
- 'pattern'
- )
- REGEXP_EXTRACT
- (
- str
- ,
- r
- 'pattern'
- )
- SPLIT
- (
- str
- ,
- delimiter
- )
- -- Date/time functions
- CURRENT_DATE
- (
- )
- ,
- CURRENT_TIMESTAMP
- (
- )
- DATE
- (
- timestamp
- )
- ,
- TIMESTAMP
- (
- date
- )
- DATE_ADD
- (
- date
- ,
- INTERVAL
- 1
- DAY
- )
- DATE_DIFF
- (
- date1
- ,
- date2
- ,
- DAY
- )
- FORMAT_DATE
- (
- '%Y-%m-%d'
- ,
- date
- )
- PARSE_DATE
- (
- '%Y%m%d'
- ,
- str
- )
- EXTRACT
- (
- YEAR
- FROM
- date
- )
- -- Aggregate functions
- COUNT
- (
- *
- )
- ,
- COUNT
- (
- DISTINCT
- column
- )
- SUM
- (
- column
- )
- ,
- AVG
- (
- column
- )
- MIN
- (
- column
- )
- ,
- MAX
- (
- column
- )
- ARRAY_AGG
- (
- column
- )
- STRING_AGG
- (
- column
- ,
- ','
- )
- -- Window functions
- ROW_NUMBER
- (
- )
- OVER
- (
- PARTITION
- BY
- col
- ORDER
- BY
- col2
- )
- RANK
- (
- )
- OVER
- (
- ORDER
- BY
- col
- DESC
- )
- LAG
- (
- col
- ,
- 1
- )
- OVER
- (
- ORDER
- BY
- date
- )
- LEAD
- (
- col
- ,
- 1
- )
- OVER
- (
- ORDER
- BY
- date
- )
- SUM
- (
- col
- )
- OVER
- (
- PARTITION
- BY
- category
- )
- 2.3. JOIN Syntax
- -- INNER JOIN
- SELECT
- a
- .
- *
- ,
- b
- .
- column
- FROM
- `
- project.dataset.table_a
- `
- AS
- a
- INNER
- JOIN
- `
- project.dataset.table_b
- `
- AS
- b
- ON
- a
- .
- id
- =
- b
- .
- id
- -- LEFT JOIN
- SELECT
- a
- .
- *
- ,
- b
- .
- column
- FROM
- `
- project.dataset.table_a
- `
- AS
- a
- LEFT
- JOIN
- `
- project.dataset.table_b
- `
- AS
- b
- ON
- a
- .
- id
- =
- b
- .
- id
- -- CROSS JOIN (commonly used for array expansion)
- SELECT
- *
- FROM
- `
- project.dataset.table
- `
- ,
- UNNEST
- (
- array_column
- )
- AS
- element
- 2.4. CTE (Common Table Expressions)
- WITH
- base_data
- AS
- (
- SELECT
- *
- FROM
- `
- project.dataset.table
- `
- WHERE
- date
- >=
- '2024-01-01'
- )
- ,
- aggregated
- AS
- (
- SELECT
- category
- ,
- COUNT
- (
- *
- )
- AS
- count
- FROM
- base_data
- GROUP
- BY
- category
- )
- SELECT
- *
- FROM
- aggregated
- ORDER
- BY
- count
- DESC
- 3. Table Design
- 3.1. Partitioning
- Divide data by date to reduce query scan volume.
- -- Create date-partitioned table
- CREATE
- TABLE
- `
- project.dataset.partitioned_table
- `
- PARTITION
- BY
- DATE
- (
- created_at
- )
- AS
- SELECT
- *
- FROM
- `
- project.dataset.source_table
- `
- ;
- -- Integer partitioning
- CREATE
- TABLE
- `
- project.dataset.int_partitioned
- `
- PARTITION
- BY
- RANGE_BUCKET
- (
- user_id
- ,
- GENERATE_ARRAY
- (
- 0
- ,
- 1000000
- ,
- 10000
- )
- )
- AS
- SELECT
- *
- FROM
- source
- ;
- -- Require partition filter
- CREATE
- TABLE
- `
- project.dataset.table
- `
- PARTITION
- BY
- DATE
- (
- created_at
- )
- OPTIONS
- (
- require_partition_filter
- =
- TRUE
- )
- ;
- 3.2. Clustering
- Sort and group data by specified columns.
- -- Clustering table
- CREATE
- TABLE
- `
- project.dataset.clustered_table
- `
- PARTITION
- BY
- DATE
- (
- created_at
- )
- CLUSTER
- BY
- user_id
- ,
- category
- AS
- SELECT
- *
- FROM
- source
- ;
- 3.3. Best Practices
- Combine partitioning and clustering
- Choose columns frequently filtered in queries
- Maximum 4 clustering columns
- Prioritize high-cardinality columns
- 4. Performance Optimization
- 4.1. Query Optimization
- -- Avoid SELECT *
- -- Bad
- SELECT
- *
- FROM
- table
- ;
- -- Good
- SELECT
- column1
- ,
- column2
- FROM
- table
- ;
- -- Leverage partition pruning
- -- Bad (function applied to partition column)
- WHERE
- DATE
- (
- created_at
- )
- =
- '2024-01-01'
- -- Good
- WHERE
- created_at
- >=
- '2024-01-01'
- AND
- created_at
- <
- '2024-01-02'
- -- Use APPROX_ functions for estimates (faster)
- SELECT
- APPROX_COUNT_DISTINCT
- (
- user_id
- )
- FROM
- table
- ;
- 4.2. JOIN Optimization
- -- Put smaller table on right side (broadcast JOIN)
- SELECT
- *
- FROM
- large_table
- JOIN
- small_table
- ON
- large_table
- .
- id
- =
- small_table
- .
- id
- ;
- -- JOIN only needed columns
- WITH
- filtered
- AS
- (
- SELECT
- id
- ,
- needed_column
- FROM
- large_table
- WHERE
- condition
- )
- SELECT
- *
- FROM
- filtered
- JOIN
- other_table
- ON
- .
- .
- .
- 4.3. Check Slot Usage
- -- Check job statistics
- SELECT
- job_id
- ,
- total_bytes_processed
- ,
- total_slot_ms
- ,
- TIMESTAMP_DIFF
- (
- end_time
- ,
- start_time
- ,
- SECOND
- )
- AS
- duration_sec
- FROM
- `
- region-us
- `
- .
- INFORMATION_SCHEMA
- .
- JOBS
- WHERE
- creation_time
- >
- TIMESTAMP_SUB
- (
- CURRENT_TIMESTAMP
- (
- )
- ,
- INTERVAL
- 1
- DAY
- )
- ORDER
- BY
- total_slot_ms
- DESC
- LIMIT
- 10
- ;
- 5. Cost Management
- 5.1. Pricing Model
- On-demand: Based on scanned data ($5/TB)
- Flat-rate (Editions): Based on reserved slots
- Storage: Active $0.02/GB, Long-term $0.01/GB
- 5.2. Cost Reduction Best Practices
- Avoid SELECT *
- Always use partition filters
- Check cost with dry run before queries
- Optimize repeated queries with materialized views
- Speed up dashboard queries with BI Engine
- 5.3. Custom Quota Settings
- -- Set query byte limit per project
- -- Configure in Cloud Console or gcloud
- 6. Data Governance
- 6.1. IAM Roles
- roles/bigquery.admin
-
- Full permissions
- roles/bigquery.dataEditor
-
- Read/write data
- roles/bigquery.dataViewer
-
- Read-only data
- roles/bigquery.jobUser
-
- Execute jobs
- roles/bigquery.user
- List datasets, execute jobs
6.2. Column-level Security
-- Apply policy tag
ALTER
TABLE
project.dataset.tableALTER COLUMN sensitive_column SET OPTIONS ( policy_tags = [ 'projects/project/locations/us/taxonomies/123/policyTags/456' ] ) ; 6.3. Row-level Security -- Create row access policy CREATE ROW ACCESS POLICY region_filter ONproject.dataset.tableGRANT TO ( 'user:analyst@example.com' ) FILTER USING ( region = 'APAC' ) ; 7. BigQuery ML 7.1. Model Creation -- Linear regression model CREATE OR REPLACE MODELproject.dataset.modelOPTIONS ( model_type = 'LINEAR_REG' , input_label_cols = [ 'target' ] ) AS SELECT feature1 , feature2 , target FROMproject.dataset.training_data; -- Logistic regression model CREATE OR REPLACE MODELproject.dataset.classifierOPTIONS ( model_type = 'LOGISTIC_REG' , input_label_cols = [ 'label' ] ) AS SELECT * FROM training_data ; 7.2. Model Evaluation and Prediction -- Model evaluation SELECT * FROM ML . EVALUATE ( MODELproject.dataset.model) ; -- Prediction SELECT * FROM ML . PREDICT ( MODELproject.dataset.model, ( SELECT * FROMproject.dataset.new_data) ) ; 8. External Data Sources 8.1. External Tables -- Reference Cloud Storage CSV as external table CREATE EXTERNAL TABLEproject.dataset.external_tableOPTIONS ( format = 'CSV' , uris = [ 'gs://bucket/path/.csv' ] , skip_leading_rows = 1 ) ; -- Parquet external table CREATE EXTERNAL TABLEproject.dataset.parquet_tableOPTIONS ( format = 'PARQUET' , uris = [ 'gs://bucket/path/.parquet' ] ) ; 8.2. Federated Query -- Connect to Cloud SQL SELECT * FROM EXTERNAL_QUERY ( 'projects/project/locations/us/connections/connection_id' , 'SELECT * FROM mysql_table' ) ; 9. Scheduled Queries 9.1. Configuration Example -- Configure in Cloud Console or bq command -- Run daily at 2 AM bq query --use_legacy_sql=false \ --schedule='every 24 hours' \ --display_name='Daily aggregation' \ --destination_table='project:dataset.daily_summary' \ --replace \ 'SELECT DATE(created_at) as date, COUNT(*) as count FROM source GROUP BY 1' 10. Reference Links Official docs: https://cloud.google.com/bigquery/docs GoogleSQL reference: https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax Function reference: https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-all bq command reference: https://cloud.google.com/bigquery/docs/bq-command-line-tool BigQuery ML: https://cloud.google.com/bigquery/docs/bqml-introduction Pricing: https://cloud.google.com/bigquery/pricing