pairs_clustering.py

import tensorflow as tf
import numpy as np
from matplotlib.pyplot import cm
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

def categorical_scatter_2d(X2D, class_idxs, ms=3, ax=None, alpha=1.0):
    ## Plot a 2D matrix with corresponding class labels: each class gets a
    ## different colour
    if ax is None:
        fig, ax = plt.subplots()
    classes = np.unique(class_idxs)
    colors = cm.rainbow(np.linspace(0, 1, len(classes)))
    for i, cls in enumerate(classes):
        ax.scatter(X2D[class_idxs == cls, 0], X2D[class_idxs == cls, 1],
                   label=str(cls), alpha=alpha, color=colors[i])
    return ax

def cos_sim(A):
    similarity = np.dot(A, A.T)

    # squared magnitude of preference vectors (number of occurrences)
    square_mag = np.diag(similarity)

    # inverse squared magnitude
    inv_square_mag = 1 / square_mag

    # if a vector doesn't occur, set its inverse magnitude to zero
    # (instead of inf)
    inv_square_mag[np.isinf(inv_square_mag)] = 0

    # inverse of the magnitude
    inv_mag = np.sqrt(inv_square_mag)

    # cosine similarity (elementwise multiply by inverse magnitudes)
    cosine = similarity * inv_mag
    cosine = cosine.T * inv_mag
    return cosine

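# Optional sanity check for cos_sim -- a sketch assuming scikit-learn is
# available (sklearn.metrics.pairwise.cosine_similarity isn't otherwise used
# in this script):
# from sklearn.metrics.pairwise import cosine_similarity
# _A = np.random.randn(5, 3)
# assert np.allclose(cos_sim(_A), cosine_similarity(_A))
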
# First we make the 'true' latent space for generating our observed pair
# similarities
N = 500
# This all still works if the latent space is higher dimensional
D = 2
z_true, labels = make_blobs(n_samples=N, cluster_std=1., centers=10,
                            n_features=D, random_state=2)
categorical_scatter_2d(z_true, labels)
plt.show()
# Create synthetic 'pair probabilities' based on cosine similarity or the
# sigmoid of the dot product. I found the latter works better. If your input
# data isn't spiky, you might need to make it so (or play with other aspects)
# to get this to work.

#sims = np.minimum(np.maximum(cos_sim(z_true) / 2. + 0.5, 0.0), 1.0) ** 10
sims = 1. / (1. + np.exp(-np.dot(z_true, z_true.T)))
plt.hist(sims.reshape(-1))
plt.show()

# Now we make a model to find latent vectors whose similarity predicts our
# pair probabilities well

tf.reset_default_graph()

# Dimensionality of our estimated latent space
D_hat = 2

# Initialize a latent vector for each of our examples
zhat = tf.get_variable('zhat', [N, D_hat],
                       initializer=tf.random_normal_initializer(0, 1.0))
# We have the target probabilities in a matrix
# Note there's nothing here to deal with the random censoring in your data.
# You could just eliminate those pairs from the loss calculation with a mask
# (a sketch of that follows the loss below).
p_target = tf.constant(sims.astype(np.float32), name='target')

# Estimated probability of a matching pair
p_hat = tf.nn.sigmoid(tf.matmul(zhat, zhat, transpose_b=True))

# Cross-entropy cost (the 1e-8 keeps the logs finite)
xent_mat = p_target * tf.log(p_hat + 1e-8) + \
           (1 - p_target) * tf.log(1 - p_hat + 1e-8)
# Mean of cross-entropy, ignoring the diagonal (self-pairs have prob = 1.0)
off_diag = 1.0 - tf.eye(tf.shape(p_hat)[0])
xent = -tf.reduce_mean(xent_mat * off_diag)

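# Sketch of the censoring mask mentioned above -- assumes a hypothetical
# {0,1} array `observed` marking which pair similarities were actually
# measured (all ones here, since nothing is censored in this synthetic data):
# observed = np.ones_like(sims)
# obs_mask = tf.constant(observed.astype(np.float32)) * off_diag
# xent = -tf.reduce_sum(xent_mat * obs_mask) / tf.reduce_sum(obs_mask)
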
# Train op + init
train_op = tf.train.AdamOptimizer(0.01).minimize(xent)
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

# Train, then plot the latent space
for i in range(1000):
    if i % 100 == 0:
        print(sess.run([xent, train_op])[0])
    else:
        sess.run(train_op)
zhat_ = sess.run(zhat)

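# Optional diagnostic sketch: how closely do the learned pair probabilities
# reproduce the targets? (`p_hat_` is just an illustrative name here.)
# p_hat_ = sess.run(p_hat)
# print('mean abs error:', np.abs(p_hat_ - sims).mean())
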
# You obviously won't know the classes for the latent space, but maybe you
# can run k-means or other clustering on it (a sketch follows)...
categorical_scatter_2d(zhat_, labels)
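
# A minimal clustering sketch, assuming scikit-learn's KMeans; n_clusters=10
# matches the blob generator above, but on real data you'd have to choose or
# estimate it:
# from sklearn.cluster import KMeans
# est_labels = KMeans(n_clusters=10).fit_predict(zhat_)
# categorical_scatter_2d(zhat_, est_labels)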

#==============================================================================
# # Below here is a minibatch version that didn't work as posted. Two likely
# # culprits (both fixed below): sims was permuted along its rows only, which
# # breaks the row/column correspondence, and the target submatrix was
# # gathered as the transpose of the one matching p_hat_sub.
# perm = np.random.permutation(len(sims))
# sims, labels = sims[perm][:, perm], labels[perm]
#
# batch_size = 32
# tf.reset_default_graph()
#
# D_hat = 2
# zhat = tf.get_variable('zhat', [N, D_hat],
#                        initializer=tf.random_normal_initializer(0, 1.0))
#
# target = tf.constant(sims.astype(np.float32), name='target')
#
# selection_i = tf.random_uniform([batch_size], 0, tf.shape(target)[0], tf.int32)
# selection_j = tf.random_uniform([batch_size], 0, tf.shape(target)[0], tf.int32)
# #selection_i = tf.range(0, batch_size)
# #selection_j = tf.range(0, batch_size)
#
# z_i = tf.gather(zhat, selection_i)
# z_j = tf.gather(zhat, selection_j)
#
# p_hat_sub = tf.nn.sigmoid(tf.matmul(z_i, z_j, transpose_b=True))
# # Gather the matching target entries: t_sub[a, b] = target[i_a, j_b]
# t_sub = tf.transpose(
#     tf.gather(tf.transpose(tf.gather(target, selection_i)), selection_j))
#
# xent_mat = t_sub * tf.log(p_hat_sub + 1e-8) + \
#            (1 - t_sub) * tf.log(1 - p_hat_sub + 1e-8)
# # The diagonal mask only matters for the tf.range selections above, where
# # element [k, k] is a self-pair
# xent_sub = -tf.reduce_mean(xent_mat * (1.0 - tf.eye(batch_size)))
#
# train_op = tf.train.AdamOptimizer(0.01).minimize(xent_sub)
#
# sess = tf.InteractiveSession()
# sess.run(tf.global_variables_initializer())
#
# zhat_ = sess.run(zhat)
# categorical_scatter_2d(zhat_, labels)
#
# #%%
# j = 0
# for i in range(int(1000 / (batch_size / 500.))):
#
#     if i % (int(100 / (batch_size / 500.))) == 0:
#         print(sess.run([xent_sub, train_op])[0])
#     else:
#         sess.run(train_op)
#     j += 1
#     if j >= 500 / batch_size:
#         j = 0
# zhat_ = sess.run(zhat)
# categorical_scatter_2d(zhat_, labels)
#
#==============================================================================