Compare commits
2 Commits
8a47ab1405
...
6cf8d03645
| Author | SHA1 | Date | |
|---|---|---|---|
| 6cf8d03645 | |||
|
|
3cc3dbe5d2 |
0
__sync_prod_db.py
Normal file
0
__sync_prod_db.py
Normal file
@@ -225,15 +225,21 @@ class DocumentSourceService:
|
|||||||
def _extract_content_from_pdf(pdf_path):
|
def _extract_content_from_pdf(pdf_path):
|
||||||
"""提取PDF内容"""
|
"""提取PDF内容"""
|
||||||
try:
|
try:
|
||||||
|
from flask import current_app
|
||||||
from PyPDF2 import PdfReader
|
from PyPDF2 import PdfReader
|
||||||
|
file_size = os.path.getsize(pdf_path) if os.path.exists(pdf_path) else 0
|
||||||
|
current_app.logger.info(f'开始提取PDF内容: path={pdf_path}, size={file_size}')
|
||||||
reader = PdfReader(pdf_path)
|
reader = PdfReader(pdf_path)
|
||||||
content = ''
|
content = ''
|
||||||
for page in reader.pages:
|
for page in reader.pages:
|
||||||
page_content = page.extract_text()
|
page_content = page.extract_text()
|
||||||
if page_content:
|
if page_content:
|
||||||
content += page_content + '\n'
|
content += page_content + '\n'
|
||||||
|
current_app.logger.info(f'PDF内容提取完成: path={pdf_path}, pages={len(reader.pages)}, content_length={len(content)}')
|
||||||
return content
|
return content
|
||||||
except Exception:
|
except Exception as e:
|
||||||
|
from flask import current_app
|
||||||
|
current_app.logger.exception(f'PDF内容提取失败: path={pdf_path}, error={str(e)}')
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@@ -269,7 +275,8 @@ class DocumentSourceService:
|
|||||||
# 提取PDF内容
|
# 提取PDF内容
|
||||||
content = DocumentSourceService._extract_content_from_pdf(pdf_path)
|
content = DocumentSourceService._extract_content_from_pdf(pdf_path)
|
||||||
if not content:
|
if not content:
|
||||||
failed_docs.append({'documentId': doc_id, 'error': 'PDF内容为空'})
|
file_size = os.path.getsize(pdf_path) if os.path.exists(pdf_path) else 0
|
||||||
|
failed_docs.append({'documentId': doc_id, 'error': f'PDF内容为空,文件大小:{file_size} bytes。请检查服务器是否安装PyPDF2、文件是否为扫描件或加密PDF'})
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 更新文档内容
|
# 更新文档内容
|
||||||
|
|||||||
@@ -7,6 +7,9 @@ PyMySQL~=0.10.0
|
|||||||
psycopg2-binary~=2.9.9
|
psycopg2-binary~=2.9.9
|
||||||
python-jenkins~=1.7.0
|
python-jenkins~=1.7.0
|
||||||
requests~=2.26.0
|
requests~=2.26.0
|
||||||
|
openai~=1.30.0
|
||||||
|
httpx~=0.27.0
|
||||||
|
PyPDF2~=3.0.1
|
||||||
Flask-Docs~=0.6.4
|
Flask-Docs~=0.6.4
|
||||||
flask_redis~=0.4.0
|
flask_redis~=0.4.0
|
||||||
jira~=3.0.1
|
jira~=3.0.1
|
||||||
Binary file not shown.
BIN
uploads/智慧运营/智慧运营V2.0/20260514174912-SZ采购工作台V2_0PRD-f77aac9e.pdf
Normal file
BIN
uploads/智慧运营/智慧运营V2.0/20260514174912-SZ采购工作台V2_0PRD-f77aac9e.pdf
Normal file
Binary file not shown.
BIN
uploads/智慧运营/智慧运营V2.0/20260514174912-回货单流程-aa48090c.pdf
Normal file
BIN
uploads/智慧运营/智慧运营V2.0/20260514174912-回货单流程-aa48090c.pdf
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user