Advertisement
jroakes

Term Distance formula

Jul 7th, 2021
933
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
MySQL 1.09 KB | None | 0 0
  1. -- Euclidean distance, i.e. the square root of the summed squared per-dimension differences (https://towardsdatascience.com/how-to-do-text-similarity-search-and-document-clustering-in-bigquery-75eb8f45ab65)
  2.  
  -- Squared difference between a and b at the single zero-based position idx.
  -- Both lookups use OFFSET, so an idx outside either array raises an error.
  CREATE TEMPORARY FUNCTION td(a ARRAY<FLOAT64>, b ARRAY<FLOAT64>, idx INT64) AS ((
    SELECT diff * diff
    FROM (SELECT a[OFFSET(idx)] - b[OFFSET(idx)] AS diff)
  ));
  -- Euclidean distance between two equal-length vectors a and b:
  -- SQRT of the sum of td(a, b, idx) over every zero-based position idx.
  -- Returns NULL when either array is NULL or both are empty (SUM over no rows).
  CREATE TEMPORARY FUNCTION term_distance(a ARRAY<FLOAT64>, b ARRAY<FLOAT64>) AS ((
    SELECT SQRT(SUM(td(a, b, idx)))
    -- Walk the whole vector rather than a fixed 0..19 range so embeddings of any
    -- dimensionality are handled. Iterating up to the longer of the two lengths
    -- makes a size mismatch fail on the out-of-range lookup inside td instead of
    -- being silently truncated to the shorter vector.
    FROM UNNEST(GENERATE_ARRAY(0, GREATEST(ARRAY_LENGTH(a), ARRAY_LENGTH(b)) - 1)) AS idx
  ));
  9.  
  -- Compare term_distance for a pair of identical sentences ("same") with
  -- term_distance for a pair of different sentences ("different").
  WITH embeddings AS (
    -- One scalar subquery per sentence; each reads the encoder column that
    -- ML.PREDICT returns for the single input row.
    SELECT
      (
        SELECT encoder
        FROM ML.PREDICT(
          MODEL `project.bigquery_ml.embedding_model`,
          (SELECT 'the cat is crazy' AS text)
        )
      ) AS arr1,
      (
        SELECT encoder
        FROM ML.PREDICT(
          MODEL `project.bigquery_ml.embedding_model`,
          (SELECT 'the cat is crazy' AS text)
        )
      ) AS arr2,
      (
        SELECT encoder
        FROM ML.PREDICT(
          MODEL `project.bigquery_ml.embedding_model`,
          (SELECT 'to be or not to be' AS text)
        )
      ) AS arr3,
      (
        SELECT encoder
        FROM ML.PREDICT(
          MODEL `project.bigquery_ml.embedding_model`,
          (SELECT 'windows was developed by bill gates' AS text)
        )
      ) AS arr4
  )
  SELECT
    term_distance(arr1, arr2) AS same,
    term_distance(arr3, arr4) AS different
  FROM embeddings
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement