from urllib.request import urlopen

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfparser import PDFParser, PDFDocument
# Script: download a PDF and print the text of every layout element, page by
# page, using pdfminer's parser / interpreter / aggregator pipeline.

# Location of the PDF whose text is extracted.
PDF_URL = 'https://www.tencent.com/zh-cn/articles/802741466496787.pdf'

# Fetch the document. The response is used as a context manager so it is
# closed even if parsing fails part-way through. Everything that may read
# from the stream stays inside the ``with`` block. A local file opened in
# binary mode can be used instead, e.g. open("naacl06-shinyama.pdf", 'rb').
with urlopen(PDF_URL) as fp:
    # Create the parser that reads from the stream.
    parser = PDFParser(fp)
    # Create the PDF document object.
    doc = PDFDocument()
    # Link the parser and the document object to each other.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Initialise the document.
    doc.initialize()

    # Create the PDF resource manager.
    resource = PDFResourceManager()
    # Create the layout-analysis parameters (library defaults).
    laparam = LAParams()
    # Create the aggregator device that collects each page's layout.
    device = PDFPageAggregator(resource, laparams=laparam)
    # Create the page interpreter that feeds pages into the device.
    interpreter = PDFPageInterpreter(resource, device)

    # doc.get_pages() yields the pages; handle one page per iteration.
    for page in doc.get_pages():
        # Let the page interpreter process the page.
        interpreter.process_page(page)
        # Ask the aggregator for the layout it built for this page.
        layout = device.get_result()
        for out in layout:
            # Only some layout elements carry text; skip the ones that
            # do not expose get_text().
            if hasattr(out, 'get_text'):
                print(out.get_text())
