Skip to content

Commit 0414cfd

Browse files
author
longendu
committed
auto rename pdf files
1 parent 3e06cb4 commit 0414cfd

File tree

4 files changed

+101
-3
lines changed

4 files changed

+101
-3
lines changed

aiutil/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""A utils Python package for data scientists."""
22

3-
__version__ = "0.86.1"
3+
__version__ = "0.87.0"

aiutil/pdf.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,13 @@
11
"""Manipulating PDFs."""
22

3+
import datetime
4+
from pathlib import Path
5+
import re
36
from typing import Iterable
47
from pypdf import PdfWriter, PdfReader
8+
import pdfplumber
9+
10+
# Date format (YYYYMMDD) passed to strftime when building the new file names.
FMT = "%Y%m%d"
511

612

713
def extract_pages(file: str, subfiles: dict[str, int | Iterable[int]]) -> None:
@@ -42,3 +48,45 @@ def _extract_pages(
4248
writer.add_page(reader.pages[index])
4349
with open(output, "wb") as fout:
4450
writer.write(fout)
51+
52+
53+
def extract_text_first_page(path: str | Path) -> str:
    """Extract the text of the first page of a PDF file.

    :param path: The path of the PDF file.
    :return: The text of the first page,
        or an empty string if the PDF has no pages or the page has no extractable text.
    """
    with pdfplumber.open(path) as pdf:
        # A PDF without pages has no first page; report "no text"
        # instead of failing with a bare IndexError.
        if not pdf.pages:
            return ""
        # Guard against a None result so the declared str return type always holds.
        return pdf.pages[0].extract_text() or ""
62+
63+
64+
def _rename_puget_sound_energy(path: Path, text_first_page: str) -> Path:
65+
m = re.search(r"Issued: (\w+ \d{1,2}, \d{4})", text_first_page)
66+
date = datetime.datetime.strptime(m.group(1), "%B %d, %Y").strftime(FMT)
67+
path_new = path.with_name(f"pse_{date}.pdf")
68+
path.rename(path_new)
69+
return path_new
70+
71+
72+
def _rename_bellevue_water(path: Path, text_first_page: str) -> Path:
73+
m = re.search(r"Bill Date: (\d{1,2}/\d{1,2}/\d{4})", text_first_page)
74+
date = datetime.datetime.strptime(m.group(1), "%m/%d/%Y").strftime(FMT)
75+
path_new = path.with_name(f"bellevue_water_{date}.pdf")
76+
path.rename(path_new)
77+
return path_new
78+
79+
80+
def rename_auto(path: str | Path) -> Path:
    """Rename a PDF file automatically based on its content.

    The text of the first page is checked for known bill issuers
    (Puget Sound Energy, Bellevue water); the first matching rule renames the file.

    :param path: The path of the PDF file.
    :return: The path of the renamed PDF file,
        or the original path (file left untouched) if no renaming rule matches.
    """
    path = Path(path)
    text = extract_text_first_page(path)
    if "Puget Sound Energy" in text:
        return _rename_puget_sound_energy(path, text)
    if "MyUtilityBill.bellevuewa.gov" in text:
        return _rename_bellevue_water(path, text)
    # No rule recognized the document: return the unchanged path explicitly
    # so the declared Path return type holds instead of falling through to None.
    return path

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "aiutil"
3-
version = "0.86.1"
3+
version = "0.87.0"
44
description = "A utils Python package for data scientists."
55
authors = [{ name = "Benjamin Du", email = "[email protected]" }]
66
requires-python = ">=3.10,<3.14"
@@ -30,6 +30,7 @@ dependencies = [
3030
"paramiko>=3.2.0",
3131
"nbformat>=5.10.4",
3232
"nbconvert>=7.16.6",
33+
"pdfplumber>=0.11.7",
3334
]
3435

3536
[project.optional-dependencies]

uv.lock

Lines changed: 50 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)