# Untitled

import networkx as nx
import random as rand
from networkx.algorithms import bipartite
from sklearn.metrics import roc_auc_score
import pandas as pd


def searchForPattern(A, x, Neighbor_of_V, Neighbor_of_U, y_true, drugs=None, ise=None):
    """Classify the candidate pair (A, x) or return the projection pairs it induces.

    ``A`` is a key of ``Neighbor_of_U`` and ``x`` a key of ``Neighbor_of_V``;
    both maps go from a node to the set of its neighbours on the other side
    of the bipartite graph.

    Args:
        A: U-side node of the candidate pair.
        x: V-side node of the candidate pair.
        Neighbor_of_V: adjacency map ``V-node -> set of U-nodes``.
        Neighbor_of_U: adjacency map ``U-node -> set of V-nodes``.
        y_true: flat 0/1 sequence read as a row-major matrix whose rows are
            the sorted unique U-side ids and whose columns are the sorted
            unique V-side ids.
        drugs: iterable of U-side ids used to locate rows of ``y_true``.
            Defaults to the module-level ``DRUGS``.
        ise: iterable of V-side ids used to locate columns of ``y_true``.
            Defaults to the module-level ``ISE``.

    Returns:
        ``"DL"`` when A and x are already adjacent; otherwise a set of
        ``(A, e)`` tuples, where ``e`` is a neighbour of x that shares a
        V-side neighbour with A; or ``'NL'`` when no such pattern can be
        built.

    Side effects:
        When no pattern exists yet, every candidate pair flagged ``1`` in
        ``y_true`` is added to both adjacency maps.  Additions made before
        an ``'NL'`` result are not rolled back.
    """
    nei_A = Neighbor_of_U[A]
    nei_x = Neighbor_of_V[x]

    if A in nei_x:
        return "DL"

    # V-side nodes two hops away from x (x itself excluded).
    nei_nei_x = set().union(*(Neighbor_of_U[n] for n in nei_x))
    nei_nei_x.discard(x)

    # U-side nodes two hops away from A (A itself excluded).
    nei_nei_A = set().union(*(Neighbor_of_V[n] for n in nei_A))
    nei_nei_A.discard(A)

    # V-side nodes adjacent to A that are also two hops away from x.
    V_part_for_projection = nei_A & nei_nei_x

    if V_part_for_projection:
        nei_V_part_for_projection = set().union(
            *(Neighbor_of_V[e] for e in V_part_for_projection)
        )
        return {(A, e) for e in nei_V_part_for_projection if e in nei_x}

    # No pattern yet: collect the pairs that would have to exist for one.
    search_pattern = {(U, x) for U in nei_nei_A}
    search_pattern.update((A, V) for V in nei_nei_x)

    # With nothing to try, recursing would repeat this exact call forever.
    if not search_pattern:
        return 'NL'

    # Sort once per call rather than twice per candidate pair.
    drug_order = sorted(set(DRUGS if drugs is None else drugs))
    ise_order = sorted(set(ISE if ise is None else ise))
    n_ise = len(ise_order)

    for a, b in search_pattern:
        index = drug_order.index(a) * n_ise + ise_order.index(b)
        if y_true[index] == 1:
            Neighbor_of_U[a].add(b)
            Neighbor_of_V[b].add(a)
        else:
            return 'NL'

    # The adjacency maps now contain the added pairs; search again.
    return searchForPattern(A, x, Neighbor_of_V, Neighbor_of_U, y_true, drugs, ise)


def sample(ISE, DRUGS, Neighbor_of_U, Neighbor_of_V, node_pair_list,
           k=200, start=1, stop=800):
    """Remove a random selection of edges from the graph structures, in place.

    Row ``i`` of the edge list is the pair ``(DRUGS[i], ISE[i])``.  Up to
    ``k`` distinct row indices are drawn from ``range(start, stop)`` and the
    corresponding pairs are removed from both adjacency maps and from
    ``node_pair_list``.

    Args:
        ISE: V-side endpoint of every edge, aligned with ``DRUGS``.
        DRUGS: U-side endpoint of every edge, aligned with ``ISE``.
        Neighbor_of_U: adjacency map ``U-node -> set of V-nodes`` (mutated).
        Neighbor_of_V: adjacency map ``V-node -> set of U-nodes`` (mutated).
        node_pair_list: collection of ``(drug, ise)`` tuples (mutated).
        k: number of row indices to draw; capped at the number available.
        start: first row index eligible for removal.
        stop: one past the last eligible row index; capped at the length of
            the shorter of ``ISE`` and ``DRUGS``.

    Returns:
        The same ``(node_pair_list, Neighbor_of_U, Neighbor_of_V)`` objects
        that were passed in.
    """
    # Never index past the end of a short edge list.
    stop = min(stop, len(ISE), len(DRUGS))
    candidates = range(start, stop)
    k = min(k, len(candidates))

    for i in rand.sample(candidates, k):
        ise_remove = ISE[i]
        drugs_remove = DRUGS[i]

        # Duplicate rows map to the same pair, so a pair may already be gone.
        Neighbor_of_U[drugs_remove].discard(ise_remove)
        Neighbor_of_V[ise_remove].discard(drugs_remove)

        tup = (drugs_remove, ise_remove)
        if tup in node_pair_list:
            node_pair_list.remove(tup)

    return node_pair_list, Neighbor_of_U, Neighbor_of_V


def _pair_weight(u, v, GU_new, GV_new, Neighbor_of_U):
    """Return the weight of one (u, v) pair, or 0.0 when it is undefined."""
    common_nei = Neighbor_of_U[u] & Neighbor_of_U[v]
    if not common_nei:
        return 0.0

    degree_sum = GU_new.degree(u) + GU_new.degree(v)
    if degree_sum == 0:
        return 0.0

    cn_score = 0.0
    for cn in common_nei:
        cn_degree = GV_new.degree(cn)
        if cn_degree == 0:
            # 1/0 is undefined: the pair contributes nothing.
            return 0.0
        cn_score += 1 / cn_degree

    return (2 / degree_sum) * cn_score


def calc_weight(pattern, GU_new, GV_new, Neighbor_of_U):
    """Score a pattern as produced by ``searchForPattern``.

    Args:
        pattern: ``'DL'``, ``'NL'``, or an iterable of ``(u, v)`` tuples
            whose members are keys of ``Neighbor_of_U``.
        GU_new: graph-like object; ``GU_new.degree(n)`` is read for both
            members of every pair.
        GV_new: graph-like object; ``GV_new.degree(n)`` is read for every
            common neighbour of a pair.
        Neighbor_of_U: adjacency map ``U-node -> set of V-nodes``.

    Returns:
        ``1.0`` for ``'DL'`` and ``0.0`` for ``'NL'``.  Otherwise the sum,
        over all pairs, of::

            2 / (deg(u) + deg(v)) * sum(1 / deg(cn) for cn in common)

        where ``common`` is the set of neighbours shared by ``u`` and ``v``.
        A pair with no shared neighbour or with a zero denominator
        contributes ``0.0``; it never resets what other pairs contributed,
        so the result does not depend on iteration order.
    """
    if pattern == 'DL':
        return 1.0
    if pattern == 'NL':
        return 0.0

    return sum(
        (_pair_weight(u, v, GU_new, GV_new, Neighbor_of_U) for u, v in pattern),
        0.0,
    )


if __name__ == '__main__':
    # Everything below stays at module level on purpose: searchForPattern
    # reads DRUGS and ISE as globals.

    # Load the first 5000 rows of the edge list; the "SEN" column is dropped.
    filename = "/home/gaudel/Desktop/monopharmacy.csv"
    df_edge_list = pd.read_csv(filename).drop("SEN",axis=1).head(5000)
    ISE_in = df_edge_list["ISE"].values.tolist()
    ISE = [str(a) for a in ISE_in]
    DRUGS = df_edge_list["DRUGS"].values.tolist()

    # Adjacency maps for both sides of the bipartite graph.
    Neighbor_of_U = {}
    Neighbor_of_V = {}
    for drug, effect in zip(DRUGS, ISE):
        Neighbor_of_U.setdefault(drug, set()).add(effect)
        Neighbor_of_V.setdefault(effect, set()).add(drug)

    # Every observed (drug, side-effect) pair.
    node_pair_list = set(zip(DRUGS, ISE))

    # Sorted unique ids fix the order of both the label and score vectors.
    drug_ids = sorted(set(DRUGS))
    ise_ids = sorted(set(ISE))

    # Ground truth: 1 where the pair is observed, taken before any edge
    # is removed by sample().
    _true = [
        1 if (drug, effect) in node_pair_list else 0
        for drug in drug_ids
        for effect in ise_ids
    ]

    w = []

    B = nx.Graph()
    B.add_nodes_from(set(DRUGS), bipartite=0)
    B.add_nodes_from(set(ISE), bipartite=1)

    # Remove a random subset of edges; the three structures are updated in place.
    edge_list, Neighbor_of_U, Neighbor_of_V = sample(
        ISE, DRUGS, Neighbor_of_U, Neighbor_of_V, node_pair_list
    )

    B.add_edges_from(edge_list)

    GU = bipartite.weighted_projected_graph(B, DRUGS)
    GV = bipartite.weighted_projected_graph(B, ISE)

    # Side-effect projection: keep edges with weight above 0.
    ise_filter_edges = [edge for edge in GV.edges(data=True) if edge[2]['weight'] > 0]
    GV_new = nx.Graph()
    GV_new.add_nodes_from(set(ISE))
    GV_new.add_edges_from(ise_filter_edges)

    # Drug projection: keep edges with weight above 18.
    drugs_filter_edges = [edge for edge in GU.edges(data=True) if edge[2]['weight'] > 18]
    GU_new = nx.Graph()
    GU_new.add_nodes_from(set(DRUGS))
    GU_new.add_edges_from(drugs_filter_edges)

    # Score every (drug, side-effect) pair in the same order as _true.
    for c in drug_ids:
        for d in ise_ids:
            pattern = searchForPattern(c, d, Neighbor_of_V, Neighbor_of_U, _true)
            score = calc_weight(pattern, GU_new, GV_new, Neighbor_of_U)
            w.append(score)
            print(c,d,' : ',score)

    auc = roc_auc_score(y_true=_true, y_score=w)
    print('auc',auc)