import pandas as pd

from src.kg_construction.llm_construct_kg import sort_format


def readXls(path):
    """Read an Excel workbook and return its rows as a list of dicts.

    Args:
        path: Location of the Excel file, passed straight to
            ``pandas.read_excel``.

    Returns:
        A list with one dict per spreadsheet row, mapping each column
        header to that row's cell value.
    """
    # NOTE: an earlier version split columns 0, 2, 4 and 6 on '+' or '/'
    # before converting; that step is disabled, so cells are returned as read.
    frame = pd.read_excel(path)
    return frame.to_dict(orient='records')


def analyze_entity(data_list):
    """Collect the distinct values of six fields across all rows.

    Args:
        data_list: Iterable of dict-like rows. Every row must provide all
            six keys listed below, and the values must be hashable.

    Returns:
        A tuple of six sets, in this order: post names, job categories,
        company industries, company names, company natures, cities.

    Raises:
        KeyError: If a row lacks one of the expected keys.
    """
    # Row keys, in the same order as the returned sets.
    fields = (
        '岗位名称',  # post name
        '职位类别',  # job category
        '公司行业',  # company industry
        '公司名称',  # company name
        '公司性质',  # company nature
        '城市',  # city
    )
    distinct = tuple(set() for _ in fields)
    # Single pass over the rows so one-shot iterables are handled too.
    for row in data_list:
        for field, values in zip(fields, distinct):
            values.add(row[field])
    return distinct

def set_become_dict_list(set_job):
    """Wrap each item of a collection in a single-entity dict.

    Args:
        set_job: Iterable of entity values (typically one of the sets
            produced by ``analyze_entity``).

    Returns:
        A list with one ``{'entity': [formatted]}`` dict per item, where
        ``formatted`` is the result of ``sort_format`` applied to that item.
        The list order follows the iteration order of ``set_job``.
    """
    return [{'entity': [sort_format(item)]} for item in set_job]