Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- id1 id2 weights
- 0 a 2a 144.0
- 1 a 2b 52.5
- 2 a 2c 2.0
- 3 a 2d 1.0
- 4 a 2e 1.0
- 5 b 2a 2.0
- 6 b 2e 1.0
- 7 b 2f 1.0
- 8 b 2b 1.0
- 9 b 2c 0.008
- 2a 2b 2c 2d 2e 2f
- 2a 1 0.5 0.7 0.2 0.1 0.3
- 2b 0.5 1 0.6 0.4 0.3 0.4
- 2c 0.7 0.6 1 0.1 0.4 0.2
- 2d 0.2 0.4 0.1 1 0.8 0.7
- 2e 0.1 0.3 0.4 0.8 1 0.8
- 2f 0.3 0.4 0.2 0.7 0.8 1
# Loop version: for every distinct id1, build the weighted average of the
# matrix rows named by that group's id2 values.
#
# Relies on names defined outside this fragment:
#   pd  - the pandas module
#   df  - frame with the columns id1, id2 and weights
#   mat - frame whose row labels are id2 values
ids = df.id1.unique()
output = pd.DataFrame(columns=mat.columns, index=ids)
for group_id in ids:  # was `id`, which shadowed the builtin
    df_slice = df.loc[df.id1 == group_id]
    to_normalize = df_slice.weights.sum()
    # Explicit copy: the rows are scaled in place below and `mat` itself
    # must stay untouched for the next group.
    temp = mat.loc[df_slice.id2].copy()
    for art in df_slice.id2:
        # `.ix` was removed from pandas; `.loc` performs the same
        # label-based (boolean mask, column name) lookup.
        weight = df_slice.loc[df_slice.id2 == art, 'weights'].values[0]
        temp.loc[art] *= weight
        temp.loc[art] /= (1. * to_normalize)
    # Column-wise sum of the scaled rows is the group's weighted average.
    output.loc[group_id] = temp.sum()
- 2a 2b 2c 2d 2e 2f
- a 0.857606 0.630424 0.672319 0.258354 0.163342 0.329676
- b 0.580192 0.540096 0.520767 0.459425 0.459904 0.559425
# Vectorised weighted average of similarity-matrix rows per id1 group.
#
# For every row of `df`, the matrix row named by id2 is scaled by
# weights / (sum of weights within the same id1); the scaled rows are then
# summed per id1.
import io

import pandas as pd

raw_df = io.StringIO("""
id1 id2 weights
0 a 2a 144.0
1 a 2b 52.5
2 a 2c 2.0
3 a 2d 1.0
4 a 2e 1.0
5 b 2a 2.0
6 b 2e 1.0
7 b 2f 1.0
8 b 2b 1.0
9 b 2c 0.008
""")
# sep=r"\s+" replaces the deprecated delim_whitespace=True.  Data rows have
# one more field than the header, so the first field becomes the index.
df = pd.read_csv(raw_df, sep=r"\s+")

raw_mat = io.StringIO("""
2a 2b 2c 2d 2e 2f
2a 1 0.5 0.7 0.2 0.1 0.3
2b 0.5 1 0.6 0.4 0.3 0.4
2c 0.7 0.6 1 0.1 0.4 0.2
2d 0.2 0.4 0.1 1 0.8 0.7
2e 0.1 0.3 0.4 0.8 1 0.8
2f 0.3 0.4 0.2 0.7 0.8 1
""")
mat = pd.read_csv(raw_mat, sep=r"\s+")

# Per-row normaliser: total weight of the id1 group the row belongs to.
df['norm'] = df.groupby('id1')['weights'].transform('sum')

# Attach to every (id1, id2) row the matrix row whose label equals id2.
m = pd.merge(df, mat, left_on='id2', right_index=True)

# The merge adds the matrix *columns*, so select by mat.columns.  Selecting
# by mat.index only works when the matrix is square with identical row and
# column labels.
value_cols = list(mat.columns)
m[value_cols] = m[value_cols].multiply(m['weights'] / m['norm'], axis=0)

output = m.groupby('id1')[value_cols].sum()
output.columns.name = 'id2'
print(output)
- id2 2a 2b 2c 2d 2e 2f
- id1
- a 0.857606 0.630424 0.672319 0.258354 0.163342 0.329676
- b 0.580192 0.540096 0.520767 0.459425 0.459904 0.559425
- disease symptom frequence
- 0 d1 s1 Very frequent (99-80%)
- 1 d1 s2 Very frequent (99-80%)
- 2 d2 s1 Frequent (79-30%)
- 3 d2 s3 Very frequent (99-80%)
- 4 d3 s2 Occasional (29-5%
- 5 d4 s1 Very frequent (99-80%)
- 6 d4 s2 Frequent (79-30%)
- 7 d4 s3 Occasional (29-5%
- 8 d5 s3 Occasional (29-5%
- 9 d5 s4 Very frequent (99-80%)
- s1 s2 s3 s4 s5 s6
- d1 1 1 0 0 0 0
- d2 1 0 1 0 0 0
- d3 0 1 1 1 1 1
- d4 1 0 1 0 0 0
- d5 0 0 1 1 0 0
# Weighted average of matrix rows per disease, following the same recipe as
# the id1/id2 example: scale the matrix row of every symptom by
# Frequence / (sum of Frequence within the disease), then sum per disease.
import io

import numpy as np
import pandas as pd

data = pd.read_csv("disease_sym_frq_list.csv", sep="[;,]", engine='python')
# NOTE(review): the steps below sum and divide the 'Frequence' column.  If the
# file holds text labels such as "Very frequent (99-80%)", they must be mapped
# to numbers first - confirm against the CSV.

# PLACEHOLDER: this string must contain the matrix itself as a
# whitespace-separated table whose row labels are symptom ids.  The text
# "data" is NOT replaced by the `data` variable; it parses to a frame with a
# single column and no rows, which is what produced "Empty DataFrame".
dat_mat = io.StringIO("""data
""")
# sep=r"\s+" replaces the deprecated delim_whitespace=True.
mat = pd.read_csv(dat_mat, sep=r"\s+")

# Fail loudly instead of silently printing an empty result when the merge
# below cannot match anything.
if not data['Symptom'].isin(mat.index).any():
    raise ValueError(
        "no value of data['Symptom'] appears in mat.index; paste the real "
        "matrix table (rows labelled by symptom) into dat_mat"
    )

# Per-row normaliser: total frequency of the disease the row belongs to.
data['norm'] = data.groupby('Disease')['Frequence'].transform('sum')

# Attach to every (disease, symptom) row the matrix row for that symptom.
m = pd.merge(data, mat, left_on='Symptom', right_index=True)

# The merge adds the matrix *columns*; selecting by mat.index is only
# correct for a square matrix with identical row and column labels.
value_cols = list(mat.columns)
m[value_cols] = m[value_cols].multiply(m['Frequence'] / m['norm'], axis=0)

output = m.groupby('Disease')[value_cols].sum()
output.columns.name = 'Symptom'
print(output)
- Empty DataFrame
- Columns: []
- Index: []
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement