analyze_xls.py 1.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445
  1. import pandas as pd
  2. from src.kg_construction.llm_construct_kg import sort_format
  3. def readXls(path):
  4. # 读取xls文件
  5. df = pd.read_excel(path)
  6. # 分隔
  7. #df.iloc[:, 0] = df.iloc[:, 0].astype(str).str.split('+')
  8. #df.iloc[:, 2] = df.iloc[:, 2].astype(str).str.split('/')
  9. #df.iloc[:, 4] = df.iloc[:, 4].astype(str).str.split('/')
  10. #df.iloc[:, 6] = df.iloc[:, 6].astype(str).str.split('/')
  11. # 将DataFrame数据转换为字典列表
  12. data_list = df.to_dict('records')
  13. return data_list
  14. def analyze_entity(data_list):
  15. set_post = set()
  16. set_job_category = set()
  17. set_company_industry = set()
  18. set_company_name = set()
  19. set_company_nature = set()
  20. set_city = set()
  21. for data in data_list:
  22. set_post.add(data['岗位名称'])
  23. set_job_category.add(data['职位类别'])
  24. set_company_industry.add(data['公司行业'])
  25. set_company_name.add(data['公司名称'])
  26. set_company_nature.add(data['公司性质'])
  27. set_city.add((data['城市']))
  28. return set_post, set_job_category, set_company_industry, set_company_name, set_company_nature, set_city
  29. def set_become_dict_list(set_job):
  30. list_job = []
  31. for data in set_job:
  32. list_job.append({'entity':[sort_format(data)]})
  33. return list_job