Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # coding: utf-8
- # In[1]:
- import sys
- import itertools
- import traceback
- import psycopg2
- import psycopg2.extras
- import psycopg2.extensions
- import networkx as nx
# Module-level connection shared by every query in this script.
# The password is intentionally NOT passed here: when none is supplied, libpq
# (which psycopg2 wraps) falls back to the PGPASSWORD environment variable or
# to ~/.pgpass, which keeps the secret out of the source file.
conn_pg = psycopg2.connect(
    host='cassius.dc2.lan',
    database='kb_spider',
    user='blackduck',
    port=5432,
    # DictCursor lets rows be read by column name (row['p1_uuid']).
    cursor_factory=psycopg2.extras.DictCursor,
)
print(conn_pg)

# Confidence score per candidate pair, keyed by frozenset({p1_uuid, p2_uuid});
# populated as a side effect of get_edges().
d_scores = {}
- # In[2]:
def get_edges(edge_stmt, bindings):
    """Get edges of candidate pairs for building the graph.

    Executes ``edge_stmt`` with ``bindings`` on the module-level ``conn_pg``
    connection.  Every returned row yields one ``[p1_uuid, p2_uuid]`` edge.
    As a side effect, a confidence score for the pair is stored in the
    module-level ``d_scores`` dict under ``frozenset(edge)``, so that
    (a, b) and (b, a) share a single entry.

    Args:
        edge_stmt: SQL text whose result rows expose ``p1_uuid`` and
            ``p2_uuid`` plus either the signature-similarity columns or a
            ``des_sim`` column.
        bindings: Parameters passed to ``cursor.execute`` for the
            placeholders in ``edge_stmt``.

    Returns:
        A list of ``[p1_uuid, p2_uuid]`` lists, one per result row.  Rows
        whose score cannot be computed are still returned as edges; they
        simply get no entry in ``d_scores``.
    """
    edges = []
    with conn_pg.cursor() as cur:
        cur.execute(edge_stmt, bindings)
        for row in cur:
            edge = [row['p1_uuid'], row['p2_uuid']]
            edges.append(edge)

            # Reset per row so a failed row can never reuse the previous
            # row's score.
            confidence = None
            try:
                if 'dws_jaccard' in row:
                    # For dws candidates, confidence score is based on a
                    # weighted average of Jaccard and Containment scores.
                    # NOTE(review): the branch is selected on 'dws_jaccard'
                    # but the score is computed from the dns_* columns --
                    # confirm which signature family is intended.
                    confidence = (0.7 * float(row['dns_jaccard'])
                                  + 0.3 * float(row['dns_containment']))
                elif 'des_sim' in row:
                    confidence = float(row['des_sim'])
            except (KeyError, TypeError, ValueError) as exc:
                # Missing column, NULL value (float(None)) or a
                # non-numeric value: report it and leave the pair unscored
                # instead of blocking on stdin.
                sys.stderr.write(
                    "get_edges: cannot score pair %r: %s\n" % (edge, exc)
                )

            if confidence is not None:
                d_scores[frozenset(edge)] = confidence
    return edges
# Candidate-pair query over automerge.project_pair_sig_similarity, restricted
# to category 1 and to pairs whose two projects both have state_code 1002.
# A pair qualifies when either its dws_* or its dns_* Jaccard/containment
# scores clear one of the threshold pairs below.
# NOTE(review): with the dns thresholds bound to (0.9, 0.9), the parameterised
# dns clause is already implied by the fixed (0.6, 0.9) dns clause.
edge_stmt_dws = """ SELECT cand.p1_uuid, cand.p2_uuid, cand.dws_jaccard, cand.dws_containment, cand.dns_jaccard, cand.dns_containment
FROM automerge.project_pair_sig_similarity cand
JOIN public.project AS p1 ON cand.p1_uuid = p1.uuid::text
JOIN public.project AS p2 ON cand.p2_uuid = p2.uuid::text
WHERE cand.category IN (1)
AND (
(cand.dws_jaccard >= %s AND cand.dws_containment >= %s)
OR (cand.dws_jaccard >= 0.6 AND cand.dws_containment >= 0.9)
OR (cand.dns_jaccard >= %s AND cand.dns_containment >= %s)
OR (cand.dns_jaccard >= 0.6 AND cand.dns_containment >= 0.9)
)
AND p1.state_code = 1002
AND p2.state_code = 1002;
"""

# Placeholder order in edge_stmt_dws:
# (dws_jaccard, dws_containment, dns_jaccard, dns_containment) minimums.
bindings = (0.7, 0.7, 0.9, 0.9)
## Data shape: each row says the two projects are duplicates of each other
# pid1 | pid2
#    1 | 2
#    2 | 1
#    3 | 2
#    4 | 5
#    6 | 7
# Wanted: one group per connected component of the pair graph
# group1: 1, 2, 3
# group2: 4, 5
# group3: 6, 7

# Undirected graph: (1, 2) and (2, 1) collapse into a single edge.
graph = nx.Graph()
graph.add_edges_from(get_edges(edge_stmt_dws, bindings))

print("Number of edges: " + str(graph.number_of_edges()))
print("Number of nodes: " + str(graph.number_of_nodes()))
print("Number of connected_components: " + str(nx.number_connected_components(graph)))
- # In[ ]:
- # In[ ]:
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement