123456789101112131415161718192021222324252627282930313233343536373839404142434445 |
- import pandas as pd
- from src.kg_construction.llm_construct_kg import sort_format
def readXls(path):
    """Load a spreadsheet and return its rows as a list of dictionaries.

    Args:
        path: Location of the spreadsheet; forwarded unchanged to
            ``pandas.read_excel``.

    Returns:
        The ``DataFrame.to_dict('records')`` view of the loaded sheet,
        i.e. one dictionary per row.
    """
    # NOTE: an earlier revision split columns 0, 2, 4 and 6 into lists
    # (column 0 on '+', the others on '/') before converting. That step is
    # disabled, so cell values are returned exactly as pandas parsed them.
    return pd.read_excel(path).to_dict('records')
def analyze_entity(data_list):
    """Collect the distinct values of six fields across all records.

    Args:
        data_list: Iterable of mappings. Each record is read with plain
            subscript access, so it must provide all six keys listed in
            ``field_names`` below.

    Returns:
        A 6-tuple of sets, in this order: post names, job categories,
        company industries, company names, company natures, cities.
    """
    # The order here defines the order of the returned tuple.
    field_names = ('岗位名称', '职位类别', '公司行业', '公司名称', '公司性质', '城市')
    distinct_values = tuple(set() for _ in field_names)

    # Single pass over the records so one-shot iterables are handled too.
    for record in data_list:
        for field, seen in zip(field_names, distinct_values):
            seen.add(record[field])

    return distinct_values
def set_become_dict_list(set_job):
    """Wrap every element of ``set_job`` in an ``{'entity': [...]}`` dict.

    Args:
        set_job: Iterable of values; each one is passed through
            ``sort_format`` before being wrapped.

    Returns:
        A list with one dictionary per element, each of the form
        ``{'entity': [sort_format(element)]}``, in iteration order.
    """
    return [{'entity': [sort_format(item)]} for item in set_job]
|