|
@@ -0,0 +1,13 @@
|
|
|
+import pdfplumber
|
|
|
+import pandas as pd
|
|
|
+
|
|
|
def readPdf(path, pageNumber: int) -> str:
    """Return the text of one PDF page with its final line removed.

    The final line of the extracted text is always dropped, on the
    assumption that it is the page number printed at the bottom of the
    page.  If a page has no such footer, its last line of real content is
    dropped instead.

    Args:
        path: Location of the PDF; passed unchanged to ``pdfplumber.open``.
        pageNumber: 0-based index into ``pdf.pages`` (``0`` is the first
            page of the document).

    Returns:
        The page text without its last line.  An empty string is returned
        when the page yields no text or only a single line.

    Note:
        ``pdf.pages`` is indexed directly, so an out-of-range
        ``pageNumber`` is expected to surface as an ``IndexError``
        (assuming ``pdf.pages`` is a plain list -- confirm against the
        installed pdfplumber version).
    """
    with pdfplumber.open(path) as pdf:
        # Select the requested page (index i is the (i + 1)-th page).
        page = pdf.pages[pageNumber]

        # Some pdfplumber versions return None rather than '' for a page
        # without a text layer (e.g. a scanned image); normalise that so
        # the string handling below cannot raise AttributeError.
        text = page.extract_text() or ''

        # Keep everything before the last newline, i.e. discard the final
        # line, which is taken to be the page number.  rpartition returns
        # '' as the head when there is no newline, matching the behaviour
        # of "'\n'.join(text.split('\n')[:-1])".
        return text.rpartition('\n')[0]
|