|
1 | 1 | """Manipulating PDFs.""" |
2 | 2 |
|
| 3 | +import datetime |
| 4 | +from pathlib import Path |
| 5 | +import re |
3 | 6 | from typing import Iterable |
4 | 7 | from pypdf import PdfWriter, PdfReader |
| 8 | +import pdfplumber |
| 9 | + |
| 10 | +FMT = "%Y%m%d" |
5 | 11 |
|
6 | 12 |
|
7 | 13 | def extract_pages(file: str, subfiles: dict[str, int | Iterable[int]]) -> None: |
@@ -42,3 +48,45 @@ def _extract_pages( |
42 | 48 | writer.add_page(reader.pages[index]) |
43 | 49 | with open(output, "wb") as fout: |
44 | 50 | writer.write(fout) |
| 51 | + |
| 52 | + |
def extract_text_first_page(path: str | Path) -> str:
    """Extract the text of the first page of a PDF file.

    :param path: The path of the PDF file.
    :return: The text of the first page, or an empty string when the page
        yields no text.
    :raises IndexError: If the PDF has no pages (``pdf.pages`` is empty).
    """
    with pdfplumber.open(path) as pdf:
        first_page = pdf.pages[0]
        # Some pdfplumber releases return None rather than "" for a page
        # without extractable text; normalise so callers always get a str
        # and substring checks such as `"x" in text` never hit a TypeError.
        return first_page.extract_text() or ""
| 62 | + |
| 63 | + |
def _rename_puget_sound_energy(path: Path, text_first_page: str) -> Path:
    """Rename a Puget Sound Energy bill after its issue date.

    The issue date is read from the ``Issued: <Month> <day>, <year>`` text on
    the first page, and the file is renamed to ``pse_<date>.pdf`` (date
    formatted with ``FMT``) in the same directory.

    :param path: The path of the PDF file.
    :param text_first_page: The text of the first page of the PDF file.
    :return: The path of the renamed PDF file.
    :raises ValueError: If no issue date is found in the text, or the matched
        text cannot be parsed as a date.
    """
    m = re.search(r"Issued: (\w+ \d{1,2}, \d{4})", text_first_page)
    if m is None:
        # re.search returns None on no match; fail with a clear message
        # before touching the file instead of an AttributeError on None.
        raise ValueError(f"Cannot find the issue date on the first page of {path}")
    date = datetime.datetime.strptime(m.group(1), "%B %d, %Y").strftime(FMT)
    path_new = path.with_name(f"pse_{date}.pdf")
    path.rename(path_new)
    return path_new
| 70 | + |
| 71 | + |
def _rename_bellevue_water(path: Path, text_first_page: str) -> Path:
    """Rename a Bellevue water bill after its bill date.

    The bill date is read from the ``Bill Date: <m>/<d>/<yyyy>`` text on the
    first page, and the file is renamed to ``bellevue_water_<date>.pdf``
    (date formatted with ``FMT``) in the same directory.

    :param path: The path of the PDF file.
    :param text_first_page: The text of the first page of the PDF file.
    :return: The path of the renamed PDF file.
    :raises ValueError: If no bill date is found in the text, or the matched
        text cannot be parsed as a date.
    """
    m = re.search(r"Bill Date: (\d{1,2}/\d{1,2}/\d{4})", text_first_page)
    if m is None:
        # re.search returns None on no match; fail with a clear message
        # before touching the file instead of an AttributeError on None.
        raise ValueError(f"Cannot find the bill date on the first page of {path}")
    date = datetime.datetime.strptime(m.group(1), "%m/%d/%Y").strftime(FMT)
    path_new = path.with_name(f"bellevue_water_{date}.pdf")
    path.rename(path_new)
    return path_new
| 78 | + |
| 79 | + |
def rename_auto(path: str | Path) -> Path | None:
    """Rename a PDF file automatically based on its content.

    The text of the first page is checked for known biller markers and the
    file is renamed by the matching handler.

    :param path: The path of the PDF file.
    :return: The path of the renamed PDF file, or ``None`` (file left
        untouched) when the content matches no known biller.
    """
    path = Path(path)
    text = extract_text_first_page(path)
    if "Puget Sound Energy" in text:
        return _rename_puget_sound_energy(path, text)
    if "MyUtilityBill.bellevuewa.gov" in text:
        return _rename_bellevue_water(path, text)
    # Unrecognised document: nothing is renamed. Return None explicitly so
    # the fall-through is visible and matches the declared return type.
    return None
0 commit comments