# Untitled

import pandas as pd
import numpy as np
import random as rd
import matplotlib.pyplot as plt
import itertools as it
import sys
import copy
from math import ceil
import pickle

# Seed both the standard-library and the NumPy global generators with the
# same value so that repeated runs draw the same random numbers.
seed = 42
rd.seed(seed)
np.random.seed(seed)

def aux_passe(l):
    """Perform one left-to-right bubble-sort pass on a copy of ``l``.

    The input list is left untouched: the pass runs on a deep copy.

    Returns:
        A pair ``(swept, changed)`` where ``swept`` is the copy after the
        pass and ``changed`` tells whether at least one adjacent pair was
        swapped.
    """
    swept = copy.deepcopy(l)
    swapped_any = False
    for pos in range(1, len(swept)):
        # Swap neighbours that are out of ascending order.
        if swept[pos] < swept[pos - 1]:
            swept[pos - 1], swept[pos] = swept[pos], swept[pos - 1]
            swapped_any = True
    return swept, swapped_any

def check_begin(l):
    """Prepend to ``l`` the smaller of its first two elements.

    Lists with fewer than two elements are returned as they are (the very
    same object). Otherwise a new list is built; on a tie the first
    element is the one that gets duplicated.
    """
    if len(l) < 2:
        return l
    head = l[1] if l[0] > l[1] else l[0]
    return [head] + l


Nc = 1000 # Number of content creators
Nv_max = 100 # Maximal number of videos per creator
M = 1000 # Exclusive upper bound on the views of one video (passed to randrange)
avantage = 10 # r_1/r_0
e_per_view = 0.01 # r_0

p0 = 0.33 # Proportion of privileged content creators
q0 = 20/100 # Fraction of creators whose status is manipulated in the Section 6 experiment
alpha = 1.96 # z-score for confidence level 95%
prop_T = 0.0025 # Proportion of the budget the regulator can request

# Set up for the figures
nbr_run = 50 # Number of runs per point
step_lies = 0.01
qqt_lies = np.arange(0, 1 + step_lies, step_lies) # Manipulated fractions swept in exp_2
max_passe = 0 # Arbitrary constant for the swapping manipulations; exp_2 keeps its own local pass counter

step_eps = 0.01
epsilon_range = np.arange(step_eps, 0.06+step_eps, step_eps)

step_beta = 0.001
beta_range = np.arange(step_beta, 1+step_beta, step_beta) # Budget splits swept in exp_3

#### Experiment of Section 5 on VideoSharingPlatform

def exp_2():
    """Run the Section 5 experiment on the simulated video-sharing platform.

    A population of ``Nc`` creators is drawn (videos per creator, privileged
    status, total views). For every manipulated fraction ``q`` in
    ``qqt_lies`` two declared-earnings columns are built:

    * ``earnings_-<n>``: ``n`` privileged creators are paid as regular ones;
    * ``earnings_+<n>``: ``n`` regular creators are paid as privileged ones.

    In addition, the true earnings sorted by popularity are bubble-sorted
    one pass at a time, and each intermediate state is stored as a
    ``bulle_<pass>`` column. For every such column the fraction of creator
    pairs whose popularity order agrees with the column order is computed.

    Returns:
        tuple: ``(ym, yp, ypasse)``. Each is a list containing one inner list
        (a single population is simulated): ``ym[0]`` and ``yp[0]`` hold one
        agreement score per value of ``qqt_lies`` for the ``earnings_-`` and
        ``earnings_+`` columns respectively; ``ypasse[0]`` holds one score
        per bubble pass.
    """
    ym = []
    yp = []
    ypasse = []

    # Videos per creator: power-law draw scaled by Nv_max, truncated to int,
    # forced to be at least 1 and sorted in ascending order.
    Va = np.random.power(3., Nc)*Nv_max
    Va = list(map(int, Va))
    Va = [x if x != 0 else 1 for x in Va]
    Va.sort()
    # 1 marks a privileged creator (probability p0), 0 a regular one.
    avantaged = np.random.choice([1, 0], Nc, p=[p0, 1-p0])
    whosa = np.where(avantaged)[0]
    whosna = np.where(avantaged == 0)[0]
    # Sets give constant-time membership tests in the earnings loop below.
    whosa_set = set(whosa.tolist())

    features = ["videomaker", "popularity", "adv", "earnings"]

    minus_cols = []
    plus_cols = []
    to_del = []
    to_add = []
    for q in qqt_lies:
        # Same draw order as the column order: removal first, then addition.
        removed = np.random.choice(whosa, int(q*len(whosa)), replace=False)
        to_del.append(set(removed.tolist()))
        minus_cols.append("earnings_-" + str(int(q*len(whosa))))
        features.append(minus_cols[-1])
        added = np.random.choice(whosna, int(q*len(whosna)), replace=False)
        to_add.append(set(added.tolist()))
        plus_cols.append("earnings_+" + str(int(q*len(whosna))))
        features.append(plus_cols[-1])

    # One row per creator: id, total views over all videos, privileged flag.
    data = [[v, np.sum([rd.randrange(M) for _ in range(Va[v])]), avantaged[v]] for v in range(Nc)]

    def earning(views, privileged):
        # r_0 per view for a regular creator, avantage * r_0 for a privileged one.
        return views*e_per_view*(1-privileged + avantage*privileged)

    for v in range(Nc):
        _, p, a = data[v]
        data[v].append(earning(p, a))
        is_privileged = v in whosa_set
        for removed, added in zip(to_del, to_add):
            al = is_privileged and v not in removed
            data[v].append(earning(p, al))

            al = (v in added) or is_privileged
            data[v].append(earning(p, al))
    df_hat = pd.DataFrame(data, columns=features)
    df_hat = df_hat.sort_values(by=['popularity'])

    # Bubble-sort the true earnings (in popularity order) one pass at a time,
    # recording the list after every pass. check_begin prepends one element,
    # which is dropped again ([1:]) before each state is stored.
    earnings_l = check_begin(df_hat["earnings"].tolist())
    swappe_e = []
    bulled_features = []
    n_passes = 0
    changed = True
    while changed:
        bulled_features.append("bulle_" + str(n_passes))
        n_passes += 1
        earnings_l, changed = aux_passe(earnings_l)
        swappe_e.append(earnings_l[1:])
    # Rows are passes; transpose so that rows are creators, columns passes.
    swapped = np.array(swappe_e).transpose().tolist()
    df_hat = df_hat.join(pd.DataFrame(
                swapped,
                index=df_hat.index,
                columns=bulled_features
            ))

    # All unordered creator pairs (i < j), computed once for every column.
    pair_i, pair_j = np.triu_indices(Nc, k=1)
    popularity = df_hat["popularity"].to_numpy(dtype=float)
    delta_pop = popularity[pair_i] - popularity[pair_j]

    def estimate(q_col):
        # Fraction of pairs for which the popularity difference and the
        # q_col difference do not have opposite signs (product >= 0).
        values = df_hat[q_col].to_numpy(dtype=float)
        agree = delta_pop * (values[pair_i] - values[pair_j]) >= 0
        return int(np.count_nonzero(agree)) / agree.size

    def estimate_limited_budget(q_col, T_part = 0.1):
        # Same score as estimate, but on T randomly drawn index pairs
        # (T = T_part times the number of unordered pairs). Not called by
        # this function; kept as the budget-limited variant.
        T = int(T_part * Nc*(Nc-1)/2)
        order = 0
        not_order = 0
        pop_list = df_hat["popularity"].tolist()
        q_col_list = df_hat[q_col].tolist()
        for _ in range(T):
            i, j = rd.randrange(Nc), rd.randrange(Nc)
            delta_e = float(pop_list[i]) - float(pop_list[j])
            delta_r = float(q_col_list[i]) - float(q_col_list[j])
            if delta_e*delta_r >= 0:
                order += 1
            else:
                not_order += 1
        return(order/(order + not_order))

    cur_ym = []
    cur_yp = []
    for minus_col, plus_col in zip(minus_cols, plus_cols):
        cur_ym.append(estimate(minus_col))
        cur_yp.append(estimate(plus_col))
    cur_ypasse = [estimate(col) for col in bulled_features]
    ym.append(cur_ym)
    yp.append(cur_yp)
    ypasse.append(cur_ypasse)
    return ym, yp, ypasse

#### Experiment of Section 6 on VideoSharingPlatform

def exp_3():
    """Run the Section 6 experiment on the simulated video-sharing platform.

    A per-video table is simulated once (one row per video of each of the
    ``Nc`` creators). For every ``beta`` in ``beta_range`` the request
    budget ``tmax`` is split into ``tA`` creator requests and ``tB`` video
    requests, and the audit ``lesstA`` is repeated ``nbr_run`` times.

    Returns:
        tuple: ``(x_tot, y_tot)``. ``x_tot[i]`` is ``alpha*sqrt(0.25/tA)``
        for the i-th beta (the same sample-size/epsilon relation that
        ``calculate_tA`` inverts); ``y_tot[i]`` is the list of ``nbr_run``
        inconsistency counts returned by ``lesstA`` for that beta.
    """
    y_tot = []
    x_tot = []

    def create_df(q0):
        # Build the per-video table. Returns the DataFrame, the number of
        # videos per creator and the name of the declared-earnings column
        # that the audit reads.
        Va = np.random.power(3., Nc)*Nv_max
        Va = list(map(int, Va))
        Va = [x if x != 0 else 1 for x in Va]
        Va.sort()
        avantaged = np.random.choice([1, 0], Nc, p=[p0, 1-p0])
        whosa = np.where(avantaged)[0]
        whosna = np.where(avantaged == 0)[0]

        features = ["videomaker", "nbr videos", "video", "views", "popularity", "adv", "earnings"]

        to_del = np.random.choice(whosa, int(q0*len(whosa)), replace=False)
        features.append("earnings_-" + str(int(q0*len(whosa))))
        to_add = np.random.choice(whosna, int(q0*len(whosna)), replace=False)
        the_feature = "earnings_+" + str(int(q0*len(whosna)))
        features.append(the_feature)

        # Sets give constant-time membership tests in the loop below.
        whosa_set = set(whosa.tolist())
        to_del_set = set(to_del.tolist())
        to_add_set = set(to_add.tolist())

        def earning(views, privileged):
            # r_0 per view for a regular creator, avantage * r_0 otherwise.
            return views*e_per_view*(1-privileged + avantage*privileged)

        data = []
        for v in range(Nc):
            views = [rd.randrange(M) for _ in range(Va[v])]
            sum_views = np.sum(views)
            a = avantaged[v]
            is_privileged = v in whosa_set
            # Earnings depend on the creator only, so compute them once and
            # repeat them on each of the creator's video rows.
            true_e = earning(sum_views, a)
            minus_e = earning(sum_views, is_privileged and v not in to_del_set)
            plus_e = earning(sum_views, (v in to_add_set) or is_privileged)
            for r in range(Va[v]):
                data.append([v, Va[v], r, views[r], sum_views, a, true_e, minus_e, plus_e])
        df_hat = pd.DataFrame(data, columns=features)

        return df_hat, Va, the_feature

    df_hat, Va, the_feature = create_df(q0)

    # (creator, video) -> (views, privileged flag, declared earning).
    # Built once so that each audited video is a dictionary lookup instead of
    # a boolean-mask scan of the whole table.
    # NOTE(review): the declared earning is truncated to an integer, exactly
    # as the original int(...) conversion did, while the running estimate it
    # is compared with is a float -- confirm the truncation is intended.
    video_records = {
        (c, r): (int(m), int(a), int(e))
        for c, r, m, a, e in zip(
            df_hat["videomaker"], df_hat["video"], df_hat["views"],
            df_hat["adv"], df_hat[the_feature])
    }

    # Total request budget and mean number of videos per creator; both are
    # the same for every beta.
    tmax = ceil((sum(Va) + Nc)*prop_T)
    k = np.mean(Va)

    def calculate_tA(alpha, epsilon, Nc):
        # Sample size ceil(alpha^2 * 0.25 / epsilon^2), reduced by
        # TA / (1 + (TA - 1)/Nc) when it is not below Nc.
        # Not called by this function.
        TA = ceil(alpha**2*0.25/epsilon**2)
        if TA < Nc:
            return TA
        else:
            return ceil(TA/(1+(TA-1)/Nc))

    def calculate_tB(tA):
        # Total number of videos of the creators from position tA onwards.
        # Not called by this function.
        return sum(Va[tA:])

    def lesstA(init_tA, init_tB):
        # Audit randomly chosen creators while budget remains. For each one,
        # sample its videos without replacement and accumulate the earning
        # implied by the sampled views; count the creator as inconsistent as
        # soon as that running sum exceeds the declared earning.
        tA = init_tA
        tB = init_tB
        if tA == 0:
            # Move one unit of budget from videos to creators.
            tA += 1
            tB -= 1
        inconsistency = 0
        remainings_videaste = list(range(Nc))
        while (tA+tB > 0) and remainings_videaste:
            c = rd.choice(remainings_videaste)
            remainings_videaste.remove(c)
            tA -= 1
            cur_vid = list(range(Va[c]))
            estimated_e = 0
            while (tB > 0) and cur_vid:
                v = rd.choice(cur_vid)
                cur_vid.remove(v)
                tB -= 1
                m, a, e = video_records[(c, v)]
                estimated_e += m*e_per_view*(1-a + avantage*a)
                if estimated_e > e:
                    inconsistency += 1
                    # Stop sampling this creator; move on to the next one.
                    break
        return inconsistency

    def tAtBx(beta, k):
        # Split tmax into creator requests (tA) and video requests (tB) and
        # return them truncated to int, with alpha*sqrt(0.25/tA) computed
        # from the untruncated tA.
        tA = (beta + (1-beta)/(k+1))*tmax
        tB = k*(1-beta)/(k+1)*tmax
        return int(tA), int(tB), alpha*np.sqrt(0.25/tA)

    for beta in beta_range:
        tA, tB, x = tAtBx(beta, k)
        x_tot.append(x)
        assert tA + tB <= tmax
        y_tot.append([lesstA(tA, tB) for _ in range(nbr_run)])

    return x_tot, y_tot


## Experiment of Section 5 on IncomePredictor
'''
The code is not provided as it is a direct use of the notebook available at https://github.com/Trusted-AI/AIF360

As in the example, we keep the following features: 'age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'
The protected variable is: 'sex'
The variable to predict is: 'income-per-year'

The only additions from the paper are the following dictionaries and functions according to Barry III, H., and Harper, A. S. Three last letters identify most female first
names. Psychological reports 87, 1 (2000), 48–54.
'''

# Pools of first names from which propose_name draws at random.
female_names = ['Ashley', 'Jessica', 'Amanda', 'Brittany', 'Samantha', 'Sarah', 'Lauren', 'Nicole', 'Megan', 'Stephanie', 'Emily', 'Jennifer', 'Elizabeth', 'Kayla', 'Rachel', 'Amber', 'Rebecca', 'Danielle', 'Chelsea', 'Alyssa', 'Melissa', 'Heather', 'Kelly', 'Christina', 'Michelle']
male_names = ['Michael', 'Matthew', 'Christopher', 'Joshua', 'Andrew', 'Joseph', 'John', 'Daniel', 'David', 'Robert', 'James', 'Justin', 'Nicholas', 'Anthony', 'William', 'Kyle', 'Zachary', 'Kevin', 'Tyler', 'Thomas', 'Eric', 'Brian', 'Brandon', 'Jonathan', 'Timothy']
# Last letter of a first name -> pair of weights that proxy_sex hands to
# random.choices for the outcomes [0, 1]. propose_name draws a male name for
# s == 1, so the first weight favours female (0) and the second male (1).
# Presumably percentages taken from Barry & Harper (2000) -- TODO confirm.
dic = {"a": [38.1, 0.8],
      "e": [24.0 , 10.4],
      "i": [4.4, 0.6],
      "y": [12.3, 11.6],
      "h": [3.5, 2.9],
      "n": [12.8, 24.8],
      "l": [1.0, 8.3],
      "r": [1.0, 7.5],
      "s": [0.4, 7.1],
      "d": [0.4, 5.7],
      "o": [0, 4.1]}

def propose_name(s):
    """Draw a random first name matching the sex code ``s``.

    A name from ``male_names`` is returned when ``s == 1``; any other value
    yields a name from ``female_names``.
    """
    pool = male_names if s == 1 else female_names
    return rd.choice(pool)

def proxy_sex(name):
    """Guess a sex code (0 or 1) at random from the last letter of ``name``.

    The last character selects a weight pair in ``dic``; letters absent from
    the table use the fallback weights ``[2.1, 16.2]``. One value is then
    drawn from ``[0, 1]`` with those weights.
    """
    last_letter = name[-1]
    weights = dic.get(last_letter, [2.1, 16.2])
    return rd.choices([0, 1], weights, k=1)[0]