I have a table of data (data_table
) and a second table with ranges (range_table
). I need an answer to which rows in the data table are not contained in the ranges table.
I expect to have most cardinality in column A as that is where I set up the partition, and some cardinality across columns B and C so set up the clusters on those columns.
The below query runs for about 20 minutes before failing with "Query exceeded resource limits".
Is there a way to set up partitions and clusters so that BigQuery will leverage them in the final join and so the query performs well? Any other thoughts?
Here is a stand-in sample:
-- Step 1: Create and populate data_table with random data
CREATE TEMP TABLE data_table
PARTITION BY RANGE_BUCKET(A, GENERATE_ARRAY(0, 100000, 10000))
CLUSTER BY A, B, C
AS (
SELECT
CAST(FLOOR(RAND() * 100001) AS INT64) AS A, -- Random integer 0 to 100,000
1690 + 10 * CAST(FLOOR(RAND() * 32) AS INT64) AS B, -- Random value from 1690, 1700, ..., 2000
1000 * CAST(FLOOR(RAND() * 101) AS INT64) AS C, -- Random value from 0, 1000, ..., 100,000
1 + CAST(FLOOR(RAND() * 5) AS INT64) AS D, -- Random integer 1 to 5
1 + CAST(FLOOR(RAND() * 6) AS INT64) AS E, -- Random integer 1 to 6
1 + CAST(FLOOR(RAND() * 4) AS INT64) AS F -- Random integer 1 to 4
FROM
UNNEST(GENERATE_ARRAY(1, 1000000)) -- Generate 1 million rows
);
-- Step 2: Create and populate range_table with random ranges
CREATE TEMP TABLE range_table
PARTITION BY RANGE_BUCKET(AMin, GENERATE_ARRAY(0, 100000, 10000))
CLUSTER BY AMin, BMin, CMin
AS (
SELECT
-- A range: 0 to 100,000
LEAST(CAST(FLOOR(RAND() * 100001) AS INT64), CAST(FLOOR(RAND() * 100001) AS INT64)) AS AMin,
GREATEST(CAST(FLOOR(RAND() * 100001) AS INT64), CAST(FLOOR(RAND() * 100001) AS INT64)) AS AMax,
-- B range: 1690 to 2000 in steps of 10
LEAST(1690 + 10 * CAST(FLOOR(RAND() * 32) AS INT64), 1690 + 10 * CAST(FLOOR(RAND() * 32) AS INT64)) AS BMin,
GREATEST(1690 + 10 * CAST(FLOOR(RAND() * 32) AS INT64), 1690 + 10 * CAST(FLOOR(RAND() * 32) AS INT64)) AS BMax,
-- C range: 0 to 100,000 in steps of 1000
LEAST(1000 * CAST(FLOOR(RAND() * 101) AS INT64), 1000 * CAST(FLOOR(RAND() * 101) AS INT64)) AS CMin,
GREATEST(1000 * CAST(FLOOR(RAND() * 101) AS INT64), 1000 * CAST(FLOOR(RAND() * 101) AS INT64)) AS CMax,
-- D range: 1 to 5
LEAST(1 + CAST(FLOOR(RAND() * 5) AS INT64), 1 + CAST(FLOOR(RAND() * 5) AS INT64)) AS DMin,
GREATEST(1 + CAST(FLOOR(RAND() * 5) AS INT64), 1 + CAST(FLOOR(RAND() * 5) AS INT64)) AS DMax,
-- E range: 1 to 6
LEAST(1 + CAST(FLOOR(RAND() * 6) AS INT64), 1 + CAST(FLOOR(RAND() * 6) AS INT64)) AS EMin,
GREATEST(1 + CAST(FLOOR(RAND() * 6) AS INT64), 1 + CAST(FLOOR(RAND() * 6) AS INT64)) AS EMax,
-- F range: 1 to 4
LEAST(1 + CAST(FLOOR(RAND() * 4) AS INT64), 1 + CAST(FLOOR(RAND() * 4) AS INT64)) AS FMin,
GREATEST(1 + CAST(FLOOR(RAND() * 4) AS INT64), 1 + CAST(FLOOR(RAND() * 4) AS INT64)) AS FMax
FROM
UNNEST(GENERATE_ARRAY(1, 1000000)) -- Generate 1 million rows
);
-- Step 3: Execute the final query to find non-matching rows
SELECT dt.*
FROM data_table AS dt
LEFT JOIN range_table AS rt
ON dt.A BETWEEN rt.AMin AND rt.AMax
AND dt.B BETWEEN rt.BMin AND rt.BMax
AND dt.C BETWEEN rt.CMin AND rt.CMax
AND dt.D BETWEEN rt.DMin AND rt.DMax
AND dt.E BETWEEN rt.EMin AND rt.EMax
AND dt.F BETWEEN rt.FMin AND rt.FMax
AND rt.AMin >= (FLOOR(dt.A / 10000) * 10000) -- Partition pruning condition
AND rt.AMin < ((FLOOR(dt.A / 10000) + 1) * 10000) -- Partition pruning condition
WHERE
rt.AMin IS NULL;
发布者:admin,转转请注明出处:http://www.yc00.com/questions/1745171061a4614926.html
评论列表(0条)