#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# __author__: "gcy"
"""
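    Input: an .xls workbook of contacts.
    Output: the number of contacts with an e-mail address, how many of them
    are reporters (judged by the 类别 column), how many of those reporters
    work for TOP12 and NEXT30 outlets, and the number of remaining contacts
    (experts and KOLs).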
"""

import os

import pandas as pd

# Path of the source workbook (.xls) to analyse.
excelFilePath = r"F:\test.xls"

# TOP12 / NEXT30 outlet lists and the counters that hold their statistics.
top12Count = 0
# NOTE(review): this list has 13 entries; "Media Title" looks like a
# spreadsheet column header rather than an outlet -- confirm before removing.
top12 = [
    "AFP",
    "AP",
    "BBC",
    "Bloomberg",
    "CNBC",
    "CNN",
    "Forbes",
    "Fortune",
    "FT",
    "Media Title",
    "New York Times",
    "Reuters",
    "Wall Street Journal",
]

next30Count = 0
# Names are matched exactly against the 机构 column, so they must not carry
# stray whitespace ("Nikkei Asian Review" previously had a trailing space).
# NOTE(review): "Re/Code, The Verge" is a single entry that presumably stands
# for two outlets; as written it only matches that exact combined string.
next30 = [
    "ANSA",
    "Huffington Post",
    "Sky News",
    "Business Insider",
    "Le Monde",
    "SF Chronicle",
    "Cheddar",
    "Mashable",
    "The Guardian",
    "Daily Mail",
    "Mid-East Eco Review",
    "The Independent",
    "Der Spiegel",
    "Nikkei Asian Review",
    "The Economist",
    "DPA",
    "Press Trust of India",
    "The Information",
    "EFE",
    "Quartz",
    "Times of India",
    "Fast Company",
    "Re/Code, The Verge",
    "USA TODAY",
    "Gulf News",
    "RFI",
    "Washington Post",
    "Harvard Business Review",
    "SCMP",
    "Wired",
]

# Load the workbook; read_excel already returns a DataFrame, so no extra
# pd.DataFrame(...) wrapper is needed.
df = pd.read_excel(excelFilePath)

# Keep only the columns that are needed for the export.
dfNeed = df[['姓名', '性别', '所在国家', '职务', '机构', '简介', '领域', '类别', '电子邮件']]

# Output path for the reduced workbook: the input path with its extension
# replaced by "_needField.xlsx". splitext strips only the final extension, so
# dots elsewhere in the path (e.g. in a directory name) are left intact.
needFieldExcelPath = os.path.splitext(excelFilePath)[0] + "_needField.xlsx"
# TODO: enable the export once auto-sized columns and cell fill are
# implemented with openpyxl:
# dfNeed.to_excel(needFieldExcelPath, index=False)

# Only contacts that actually have an e-mail address take part in the
# statistics. Filtering on notna() (instead of looping over the first
# count() row labels) keeps the right rows even when empty e-mail cells are
# scattered through the sheet.
emailedContacts = df[df["电子邮件"].notna()]
emailCount = len(emailedContacts)
print(emailCount)

# OPEN QUESTION (from the original author): is a contact a reporter when the
# 类别 value merely contains "记者", or only when it is exactly "记者"?
# The current rule is "contains". Contacts located in 德国 are never counted
# as reporters, so they end up in the experts/KOL remainder below.
isReporter = (emailedContacts["所在国家"] != "德国") & emailedContacts["类别"].astype(
    str
).str.contains("记者", regex=False, na=False)
reporterCount = int(isReporter.sum())

# Compare outlet names with surrounding whitespace removed on both sides so
# that a stray space in the sheet or in the lists cannot hide a match.
reporterOrgs = emailedContacts.loc[isReporter, "机构"].astype(str).str.strip()
top12Count = int(reporterOrgs.isin({name.strip() for name in top12}).sum())
next30Count = int(reporterOrgs.isin({name.strip() for name in next30}).sum())

# Summary line; everyone with an e-mail who is not a reporter is reported as
# expert/KOL.
msg = (
    f"公开邮箱人数：{emailCount}人，其中记者{reporterCount}人"
    f"（其中TOP12  {top12Count}人，NEXT30 {next30Count}人）；"
    f"专家和KOL {emailCount - reporterCount}人；"
)

print(msg)