analyze_pdf.py 442 B

12345678910111213
  1. import pdfplumber
  2. import pandas as pd
  3. def readPdf(path, pageNumber):
  4. with pdfplumber.open(path) as pdf:
  5. content = ''
  6. # 读取PDF文档第i+1页
  7. page = pdf.pages[pageNumber]
  8. # page.extract_text()函数即读取文本内容,下面这步是去掉文档最下面的页码
  9. page_content = '\n'.join(page.extract_text().split('\n')[:-1])
  10. content = content + page_content
  11. return content