Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
-- Per-coordinate squared difference: the building block of the Euclidean
-- distance computed by term_distance. Approach adapted from
-- https://towardsdatascience.com/how-to-do-text-similarity-search-and-document-clustering-in-bigquery-75eb8f45ab65
--
--   a, b : the two vectors being compared
--   idx  : zero-based position to compare; OFFSET raises an error when idx
--          falls outside the bounds of either array
-- Returns (a[idx] - b[idx]) squared.
CREATE TEMPORARY FUNCTION td(a ARRAY<FLOAT64>, b ARRAY<FLOAT64>, idx INT64) AS ((
  SELECT delta * delta
  FROM (SELECT a[OFFSET(idx)] - b[OFFSET(idx)] AS delta)
));
-- Euclidean distance between two vectors: the square root of the summed
-- per-coordinate squared differences produced by td.
--
-- The index range is derived from the arrays themselves instead of being
-- fixed at 0..19, so vectors of any dimension are compared in full.
-- Iterating up to the longer array's length makes a length mismatch fail with
-- an out-of-bounds error inside td rather than silently ignoring the extra
-- trailing values.
--
--   a, b : vectors to compare; expected to have the same length
-- Returns the distance as FLOAT64, or NULL when there are no coordinates to
-- compare (for example, two empty arrays).
CREATE TEMPORARY FUNCTION term_distance(a ARRAY<FLOAT64>, b ARRAY<FLOAT64>) AS ((
  SELECT SQRT(SUM(td(a, b, idx)))
  FROM UNNEST(
    GENERATE_ARRAY(0, GREATEST(ARRAY_LENGTH(a), ARRAY_LENGTH(b)) - 1)
  ) AS idx
));
-- Sanity check of the embedding distance: `same` compares a sentence with
-- itself, `different` compares two unrelated sentences.
WITH sentence_embeddings AS (
  -- One row holding the `encoder` output of the model for each test sentence.
  SELECT
    (
      SELECT encoder
      FROM ML.PREDICT(
        MODEL `project.bigquery_ml.embedding_model`,
        (SELECT 'the cat is crazy' AS text)
      )
    ) AS cat_first,
    (
      SELECT encoder
      FROM ML.PREDICT(
        MODEL `project.bigquery_ml.embedding_model`,
        (SELECT 'the cat is crazy' AS text)
      )
    ) AS cat_second,
    (
      SELECT encoder
      FROM ML.PREDICT(
        MODEL `project.bigquery_ml.embedding_model`,
        (SELECT 'to be or not to be' AS text)
      )
    ) AS hamlet_quote,
    (
      SELECT encoder
      FROM ML.PREDICT(
        MODEL `project.bigquery_ml.embedding_model`,
        (SELECT 'windows was developed by bill gates' AS text)
      )
    ) AS windows_fact
)
SELECT
  term_distance(cat_first, cat_second) AS same,
  term_distance(hamlet_quote, windows_fact) AS different
FROM sentence_embeddings;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement