databricks/lib/spark_helper/files.py (26 lines of code) (raw):
from io import BytesIO
from typing import Any, Dict
from lib.spark_helper.storage_service import SparkStorageService
from pypdf import PdfReader
def extract_text_from_pdf(pdf_content: bytes) -> str:
reader = PdfReader(BytesIO(pdf_content))
text = ""
for page in reader.pages:
text += page.extract_text()
return text
class FilesStorage:
VOLUME_NAME = "files"
TXT_STORAGE_PATH = VOLUME_NAME + "/{file_id}/{file_id}.txt"
PDF_STORAGE_PATH = VOLUME_NAME + "/{file_id}/{file_id}.pdf"
def __init__(self, configs: Dict[str, Any]) -> None:
self.storage_service = SparkStorageService(configs)
self.storage_service.create_volume_if_not_exists(self.VOLUME_NAME)
def store_pdf(self, pdf_content: bytes, file_id: int) -> str:
pdf_file_path = self.PDF_STORAGE_PATH.format(file_id=file_id)
self.storage_service.write_binary(pdf_content, pdf_file_path)
return pdf_file_path
def store_text(self, pdf_content: bytes, file_id: int) -> str:
pdf_text = extract_text_from_pdf(pdf_content)
txt_file_path = self.TXT_STORAGE_PATH.format(file_id=file_id)
self.storage_service.write_text(pdf_text, txt_file_path)
return txt_file_path