Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- import matplotlib.pyplot as plt
- import praw
- import re
LINK = 'https://www.reddit.com/r/FortCollins/comments/11mkrfu/salary_transparency_thread/'
ID = '11mkrfu'

# Inclusive upper edge of each histogram bucket, in dollars. A salary is
# counted in the first bucket whose edge is >= the salary.
SALARY_BUCKETS = [10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000,
                  100000, 125000, 150000, 200000, 300000, 400000, 500000, 10000000]

# Arbitrary plausibility window: some non-salary numbers still sneak through
# the word filters, so anything outside this range is thrown out.
MIN_SALARY = 5000
MAX_SALARY = 1000000

# Age/sex tags such as "32M" or "40+F".
AGE_SEX_RE = re.compile(r'[0-9]+[+]*[mMfF]+')
# Retirement-plan mentions ("401k", "401(k)") would otherwise parse as $401,000.
RETIREMENT_RE = re.compile(r'401\(?[kK]')
# Substrings marking hourly rates or years of experience rather than a salary.
NON_SALARY_MARKERS = ('hr', 'hour', 'yrs', 'YOE')
# A number with optional thousands separators and decimals, optionally
# followed *directly* by a "k"/"K" multiplier.
AMOUNT_RE = re.compile(r'([0-9][0-9,]*(?:\.[0-9]+)?)([kK]?)')


def parse_salary(word):
    """Return the yearly salary in dollars encoded by *word*, or None.

    *word* is a single whitespace-delimited token from a comment. Tokens that
    look like an age/sex tag, a label ending in ':', an hourly rate, a
    years-of-experience figure or a 401k mention are rejected. Only the first
    number in the token is used; a trailing "k"/"K" multiplies it by 1000.
    Values outside [MIN_SALARY, MAX_SALARY] are rejected as implausible.
    """
    if AGE_SEX_RE.search(word) or RETIREMENT_RE.search(word):
        return None
    if word.endswith(':') or any(marker in word for marker in NON_SALARY_MARKERS):
        return None

    match = AMOUNT_RE.search(word)
    if match is None:
        return None

    digits, suffix = match.groups()
    amount = float(digits.replace(',', ''))
    if suffix:
        amount *= 1000
    # round() guards against float noise such as 64.3 * 1000.
    amount = int(round(amount))

    if amount < MIN_SALARY or amount > MAX_SALARY:
        return None
    return amount


def extract_salaries(text):
    """Return every plausible salary found in the comment body *text*."""
    parsed = (parse_salary(word) for word in text.split())
    return [amount for amount in parsed if amount is not None]


def count_by_bucket(salaries, edges=None):
    """Return how many of *salaries* fall into each bucket.

    *edges* holds the ascending inclusive upper bounds of the buckets and
    defaults to SALARY_BUCKETS. Salaries above the last edge are not counted.
    """
    if edges is None:
        edges = SALARY_BUCKETS
    counts = [0] * len(edges)
    for salary in salaries:
        for idx, edge in enumerate(edges):
            if salary <= edge:
                counts[idx] += 1
                break
    return counts


def bucket_labels(edges=None):
    """Return one x-axis label per bucket, e.g. '\\$10k to \\$20k'.

    The backslash before each '$' is part of the label text: matplotlib
    would otherwise treat a '$...$' pair as math text.
    """
    if edges is None:
        edges = SALARY_BUCKETS
    if not edges:
        return []
    labels = [r'\$0 to \${:.0f}k'.format(edges[0] / 1000)]
    for lower, upper in zip(edges, edges[1:]):
        labels.append(r'\${:.0f}k to \${:.0f}k'.format(lower / 1000, upper / 1000))
    return labels


def main():
    """Fetch the thread's comments and show a bar chart of reported salaries."""
    reddit = praw.Reddit('me')
    submission = reddit.submission(ID)
    # Expand "load more comments" placeholders first: they are MoreComments
    # objects without a .body and would break the loop below.
    submission.comments.replace_more(limit=None)

    salaries = []
    for comment in submission.comments:
        salaries.extend(extract_salaries(comment.body))

    fig = plt.figure()
    plt.bar(bucket_labels(), count_by_bucket(salaries), align='center', alpha=0.5)
    plt.xticks(rotation='vertical')
    # Leave room below the axes for the vertical tick labels.
    fig.subplots_adjust(bottom=0.3)
    plt.ylabel('Number of respondents')
    plt.show()


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment