SHARE
TWEET

Untitled

a guest Jun 24th, 2019 84 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. df = pd.DataFrame({'countries':['US','UK','Germany','China']})
  2. countries = ['UK','China']
  3.  
  4. # pseudo-code:
  5. df[df['countries'] not in countries]
  6.      
  7. df = pd.DataFrame({'countries':['US','UK','Germany','China']})
  8. countries = pd.DataFrame({'countries':['UK','China'], 'matched':True})
  9.  
  10. # IN
  11. df.merge(countries,how='inner',on='countries')
  12.  
  13. # NOT IN
  14. not_in = df.merge(countries,how='left',on='countries')
  15. not_in = not_in[pd.isnull(not_in['matched'])]
  16.      
  17. >>> df
  18.   countries
  19. 0        US
  20. 1        UK
  21. 2   Germany
  22. 3     China
  23. >>> countries
  24. ['UK', 'China']
  25. >>> df.countries.isin(countries)
  26. 0    False
  27. 1     True
  28. 2    False
  29. 3     True
  30. Name: countries, dtype: bool
  31. >>> df[df.countries.isin(countries)]
  32.   countries
  33. 1        UK
  34. 3     China
  35. >>> df[~df.countries.isin(countries)]
  36.   countries
  37. 0        US
  38. 2   Germany
  39.      
  40. In [5]: df.query("countries in @countries")
  41. Out[5]:
  42.   countries
  43. 1        UK
  44. 3     China
  45.  
  46. In [6]: df.query("countries not in @countries")
  47. Out[6]:
  48.   countries
  49. 0        US
  50. 2   Germany
  51.      
  52. criterion = lambda row: row['countries'] not in countries
  53. not_in = df[df.apply(criterion, axis=1)]
  54.      
  55. dfbc = dfbc[~dfbc['BUSINESS_ID'].isin(dfProfilesBusIds['BUSINESS_ID'])]
  56.      
  57. ╒════════╤══════════════════════╤══════════════════════╕
  58. │        │ Python               │ Pandas               │
  59. ╞════════╪══════════════════════╪══════════════════════╡
  60. │ in     │ item in sequence     │ sequence.isin(item)  │
  61. ├────────┼──────────────────────┼──────────────────────┤
  62. │ not in │ item not in sequence │ ~sequence.isin(item) │
  63. ╘════════╧══════════════════════╧══════════════════════╛
  64.      
  65. df = pd.DataFrame({'countries': ['US', 'UK', 'Germany', np.nan, 'China']})
  66. df
  67.   countries
  68. 0        US
  69. 1        UK
  70. 2   Germany
  71. 3     China
  72.  
  73. c1 = ['UK', 'China']             # list
  74. c2 = {'Germany'}                 # set
  75. c3 = pd.Series(['China', 'US'])  # Series
  76. c4 = np.array(['US', 'UK'])      # array
  77.      
  78. df['countries'].isin(c1)
  79.  
  80. 0    False
  81. 1     True
  82. 2    False
  83. 3    False
  84. 4     True
  85. Name: countries, dtype: bool
  86.  
  87. # `in` operation
  88. df[df['countries'].isin(c1)]
  89.  
  90.   countries
  91. 1        UK
  92. 4     China
  93.  
  94. # `not in` operation
  95. df[~df['countries'].isin(c1)]
  96.  
  97.   countries
  98. 0        US
  99. 2   Germany
  100. 3       NaN
  101.      
  102. # Filter with `set` (tuples work too)
  103. df[df['countries'].isin(c2)]
  104.  
  105.   countries
  106. 2   Germany
  107.      
  108. # Filter with another Series
  109. df[df['countries'].isin(c3)]
  110.  
  111.   countries
  112. 0        US
  113. 4     China
  114.      
  115. # Filter with array
  116. df[df['countries'].isin(c4)]
  117.  
  118.   countries
  119. 0        US
  120. 1        UK
  121.      
  122. df2 = pd.DataFrame({
  123.     'A': ['x', 'y', 'z', 'q'], 'B': ['w', 'a', np.nan, 'x'], 'C': np.arange(4)})
  124. df2
  125.  
  126.    A    B  C
  127. 0  x    w  0
  128. 1  y    a  1
  129. 2  z  NaN  2
  130. 3  q    x  3
  131.  
  132. c1 = ['x', 'w', 'p']
  133.      
  134. df2[['A', 'B']].isin(c1)
  135.  
  136.       A      B
  137. 0   True   True
  138. 1  False  False
  139. 2  False  False
  140. 3  False   True
  141.      
  142. df2[['A', 'B']].isin(c1).any(axis=1)
  143.  
  144. 0     True
  145. 1    False
  146. 2    False
  147. 3     True
  148. dtype: bool
  149.  
  150. df2[df2[['A', 'B']].isin(c1).any(axis=1)]
  151.  
  152.    A  B  C
  153. 0  x  w  0
  154. 3  q  x  3
  155.      
  156. df2[df2[['A', 'B']].isin(c1).all(axis=1)]
  157.  
  158.    A  B  C
  159. 0  x  w  0
  160.      
  161. # `in` operation
  162. df[np.isin(df['countries'], c1)]
  163.  
  164.   countries
  165. 1        UK
  166. 4     China
  167.  
  168. # `not in` operation
  169. df[np.isin(df['countries'], c1, invert=True)]
  170.  
  171.   countries
  172. 0        US
  173. 2   Germany
  174. 3       NaN
  175.      
  176. c1_set = set(c1) # Using `in` with `sets` is a constant time operation...
  177.                  # This doesn't matter for pandas because the implementation differs.
  178. # `in` operation
  179. df[[x in c1_set for x in df['countries']]]
  180.  
  181.   countries
  182. 1        UK
  183. 4     China
  184.  
  185. # `not in` operation
  186. df[[x not in c1_set for x in df['countries']]]
  187.  
  188.   countries
  189. 0        US
  190. 2   Germany
  191. 3       NaN
  192.      
  193. df = pd.DataFrame({'countries':['US','UK','Germany','China']})
  194. countries = ['UK','China']
  195.      
  196. df[df.countries.isin(countries)]
  197.      
  198. df[df.countries.isin([x for x in np.unique(df.countries) if x not in countries])]
  199.      
  200. def subset(df, query="", select=("")):
  201.     query = str(query)
  202.     query = query.replace("|", "¤")
  203.     query = query.replace("&", "½")
  204.     query = query.replace(" ", "")
  205.     queries = re.split('¤|½', query)
  206.  
  207.     # the %!in% and %in% operator will have to be evaluated after
  208.     in_operator = ["%in%", "%!in%"]
  209.     queries_lc = [q for q in queries if any(x in q for x in in_operator)]
  210.     queries = [q for q in queries if not any(x in q for x in in_operator)]
  211.  
  212.     if len(select) == 0:select = df.columns
  213.     query = "".join(queries)
  214.     query = query.replace("¤", "|")
  215.     query = query.replace("½", "&")
  216.  
  217.     if len(queries_lc) > 0:
  218.         for lc_q in queries_lc:
  219.  
  220.             if in_operator[0] in lc_q:  # %in%
  221.                 var, list_con = re.split(in_operator[0], lc_q)
  222.                 globals()["list_condition_used_in_subset"] = eval(list_con)
  223.                 df = df[df[var].isin(list_condition_used_in_subset)]
  224.  
  225.             else:  # %!in% - not in
  226.                 var, list_con = re.split(in_operator[1], lc_q)
  227.                 globals()["list_condition_used_in_subset"] = eval(list_con)
  228.                 df = df[~df[var].isin(list_condition_used_in_subset)]
  229.  
  230.     if len(queries) == 0 and len(queries_lc) > 0: df = df[select]  # if only a list condition query
  231.     else:df = pd.DataFrame(df.query(query)[select])  # perform query and return selected - normal thing
  232.  
  233.     return df
  234.  
  235. df = pd.DataFrame({'countries':['US','UK','Germany','China'],"GDP":[1,2,3,4]})
  236. countries = ['UK','China']
  237. subset(df,query="countries %in% countries & GDP > 2")
  238.  
  239.   countries  GDP
  240. 3     China    4
  241.  
  242. subset(df,query="countries %!in% countries",select=["GDP"])
  243.  
  244.    GDP
  245. 0    1
  246. 2    3
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top