[pre-commit.ci] auto fixes from pre-commit.com hooks

pre-commit-ci[bot] · pre-commit-ci[bot] · commit 772bf078c5cb · 2025-10-27T17:15:12.000Z
for more information, see https://pre-commit.ci
diff --git a/code_of_conduct.md b/code_of_conduct.md
@@ -1,5 +1,27 @@
 # Contributor Covenant Code of Conduct
 
+<!--TOC-->
+
+______________________________________________________________________
+
+**Table of Contents**
+
+- [Our Pledge](#our-pledge)
+- [Our Standards](#our-standards)
+- [Enforcement Responsibilities](#enforcement-responsibilities)
+- [Scope](#scope)
+- [Enforcement](#enforcement)
+- [Enforcement Guidelines](#enforcement-guidelines)
+  - [1. Correction](#1-correction)
+  - [2. Warning](#2-warning)
+  - [3. Temporary Ban](#3-temporary-ban)
+  - [4. Permanent Ban](#4-permanent-ban)
+- [Attribution](#attribution)
+
+______________________________________________________________________
+
+<!--TOC-->
+
 ## Our Pledge
 
 We as members, contributors, and leaders pledge to make participation in our
diff --git a/data/tabular/ld50_catmos/meta.yaml b/data/tabular/ld50_catmos/meta.yaml
diff --git a/data/tabular/mona/example_processing_and_templates.ipynb b/data/tabular/mona/example_processing_and_templates.ipynb
@@ -20,7 +20,6 @@
     "from tqdm import tqdm\n",
     "\n",
     "# import datasets\n",
-    "import rdkit\n",
     "import rdkit.Chem as Chem\n",
     "import rdkit.RDLogger as RDLogger"
    ]
@@ -1444,7 +1443,7 @@
     "                k = md[\"name\"]\n",
     "                v = md.get(\"value\", np.nan)\n",
     "                df_row[\"md_\" + transform_key(k)] = v\n",
-    "                if not (v is np.nan):\n",
+    "                if v is not np.nan:\n",
     "                    md_keys.append(k)\n",
     "            md_key_counter.update(md_keys)\n",
     "            compounds = entry.get(\"compound\", [])\n",
diff --git a/data/tabular/ocp/transform.py b/data/tabular/ocp/transform.py
@@ -21,8 +21,8 @@ def uniCode2Latex(text: str) -> str:
         text = text.replace(chr(code), f"$_{code-8320}$")
 
     text = text.replace("\u0305", "$^-$")
-    text = text.replace("\u207A", "$^+$")
-    text = text.replace("\u207B", "$^-$")
+    text = text.replace("\u207a", "$^+$")
+    text = text.replace("\u207b", "$^-$")
     text = text.replace("\u2074", "$^4$")
     text = text.replace("\u2070", "$^0$")
     text = text.replace("\u2078", "$^1$")
diff --git a/data/tabular/orbnet_denali/develop_transform.ipynb b/data/tabular/orbnet_denali/develop_transform.ipynb
@@ -25,11 +25,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from pathlib import Path\n",
     "from rdkit import Chem\n",
-    "import matplotlib.pyplot as plt\n",
-    "import numpy as np\n",
-    "import os\n",
     "import pandas as pd\n",
     "from glob import glob"
    ]
@@ -474,7 +470,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from rdkit.Chem import rdDetermineBonds\n",
     "from chemnlp.utils import xyz_to_mol"
    ]
   },
diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md
@@ -1,5 +1,23 @@
 # Contributing to ChemNLP
 
+<!--TOC-->
+
+______________________________________________________________________
+
+**Table of Contents**
+
+- [Getting Started](#getting-started)
+- [Implementing a Dataset](#implementing-a-dataset)
+  - [meta.yaml Structure](#metayaml-structure)
+  - [transform.py Guidelines](#transformpy-guidelines)
+- [Text Templates](#text-templates)
+- [Testing Your Contribution](#testing-your-contribution)
+- [Submitting Your Contribution](#submitting-your-contribution)
+
+______________________________________________________________________
+
+<!--TOC-->
+
 Thank you for your interest in contributing to ChemNLP! There are many ways to contribute, including implementing datasets, improving code, and enhancing documentation.
 
 ## Getting Started
@@ -17,7 +35,6 @@ One of the most valuable contributions is implementing a dataset. Here's how to
 1. Choose a dataset from our [awesome list](https://github.com/kjappelbaum/awesome-chemistry-datasets) or add a new one there.
 2. Create an issue in this repository stating your intention to add the dataset.
 3. Make a Pull Request (PR) that adds a new folder in `data` with the following files:
-
    - `meta.yaml`: Describes the dataset (see structure below).
    - `transform.py`: Python code to transform the original dataset into a usable form.
 
diff --git a/docs/api/meta_yaml_augmentor.md b/docs/api/meta_yaml_augmentor.md
@@ -1,5 +1,25 @@
 # Meta YAML Augmenter
 
+<!--TOC-->
+
+______________________________________________________________________
+
+**Table of Contents**
+
+- [Overview](#overview)
+- [generate_augmented_meta_yaml](#generate_augmented_meta_yaml)
+- [CLI Interface](#cli-interface)
+  - [Usage](#usage)
+  - [Arguments](#arguments)
+  - [Example](#example)
+- [Augmentation Process](#augmentation-process)
+- [Notes](#notes)
+- [Example Usage in Python](#example-usage-in-python)
+
+______________________________________________________________________
+
+<!--TOC-->
+
 ## Overview
 
 The Meta YAML Augmenter is a tool designed to enhance existing `meta.yaml` files for chemical datasets. It uses Large Language Models (LLMs) to generate additional templates and improve the metadata structure, particularly focusing on advanced sampling methods and template formats.
diff --git a/docs/api/meta_yaml_generator.md b/docs/api/meta_yaml_generator.md
@@ -1,5 +1,19 @@
 # Meta YAML Generator
 
+<!--TOC-->
+
+______________________________________________________________________
+
+**Table of Contents**
+
+- [Overview](#overview)
+- [`generate_meta_yaml`](#generate_meta_yaml)
+- [Usage Example](#usage-example)
+
+______________________________________________________________________
+
+<!--TOC-->
+
 ## Overview
 
 The Meta YAML Generator is a tool designed to automatically create a `meta.yaml` file for chemical datasets using Large Language Models (LLMs). It analyzes the structure of a given DataFrame and generates a comprehensive metadata file, including advanced sampling methods and template formats.
diff --git a/docs/api/sampler.md b/docs/api/sampler.md
@@ -1,5 +1,28 @@
 # Sampler Module
 
+<!--TOC-->
+
+______________________________________________________________________
+
+**Table of Contents**
+
+- [Overview](#overview)
+- [TemplateSampler](#templatesampler)
+  - [Class: TemplateSampler](#class-templatesampler)
+    - [Initialization](#initialization)
+    - [Configuration Options](#configuration-options)
+    - [Main Methods](#main-methods)
+      - [`sample`](#sample)
+      - [`enable_class_balancing`](#enable_class_balancing)
+      - [`disable_class_balancing`](#disable_class_balancing)
+    - [Identifier Wrapping](#identifier-wrapping)
+    - [Usage Examples](#usage-examples)
+- [Notes](#notes)
+
+______________________________________________________________________
+
+<!--TOC-->
+
 ## Overview
 
 The `sampler` module provides functionality for generating text samples based on templates and data. It is primarily used for creating datasets for natural language processing tasks in chemistry and related fields. The main class in this module is `TemplateSampler`, which allows for flexible text generation with support for multiple choice questions, class balancing, and identifier wrapping.
diff --git a/docs/api/sampler_cli.md b/docs/api/sampler_cli.md
@@ -1,5 +1,31 @@
 # Sampler CLI
 
+<!--TOC-->
+
+______________________________________________________________________
+
+**Table of Contents**
+
+- [Overview](#overview)
+- [Usage](#usage)
+  - [Arguments](#arguments)
+  - [Options](#options)
+- [Detailed Option Descriptions](#detailed-option-descriptions)
+  - [`chunksize`](#chunksize)
+  - [`class_balanced`](#class_balanced)
+  - [`benchmarking`](#benchmarking)
+  - [`multiple_choice`](#multiple_choice)
+  - [`additional_templates`](#additional_templates)
+  - [`use_standard_templates`](#use_standard_templates)
+  - [`wrap_identifiers`](#wrap_identifiers)
+- [Examples](#examples)
+- [Notes](#notes)
+- [Troubleshooting](#troubleshooting)
+
+______________________________________________________________________
+
+<!--TOC-->
+
 ## Overview
 
 The Sampler CLI is a command-line interface tool designed to process chemical datasets using the `TemplateSampler`. It allows for flexible text generation based on templates, with support for various sampling scenarios including class balancing, benchmarking, and multiple-choice questions.
diff --git a/experiments/README.md b/experiments/README.md
@@ -1,5 +1,18 @@
 # Working with the Stability cluster
 
+<!--TOC-->
+
+______________________________________________________________________
+
+**Table of Contents**
+
+- [GPT-Neox](#gpt-neox)
+- [Hugging Face](#hugging-face)
+
+______________________________________________________________________
+
+<!--TOC-->
+
 We currently run our large scale experiments on the Stability AI HPC cluster.
 This subdirectory features a few helpful scripts that can help you get up and
 running on the cluster.
@@ -11,7 +24,6 @@ running on the cluster.
 
 1. [Create Environment](scripts/env_creation_neox.sh) -
    creates a basic conda environment for experiments.
-
    - Creates a conda environment at the prefix `CONDA_ENV_PATH` path.
      > Using the positional argument passed into the script
    - Clones `chemnlp` into your personal cluster `USER` directory.
@@ -28,7 +40,6 @@ running on the cluster.
 
 2. [Training Models](scripts/sbatch_train_neox.sh) -
    runs a GPT-NeoX training pipeline
-
    - creates a conda environment using the `env_creation_neox.sh` script.
    - runs the GPT-NeoX `train.py` script using the user configuration
      > as GPT-NeoX configurations can be combined, the PEFT configurations are held
@@ -48,7 +59,6 @@ running on the cluster.
 
 1. [Create Environment](scripts/env_creation_hf.sh) -
    creates a basic conda environment for experiments.
-
    - Creates a conda environment at the prefix `CONDA_ENV_PATH` path.
      > Using the positional argument passed into the script
    - Clones `chemnlp` into your personal cluster `USER` directory.
@@ -65,7 +75,6 @@ running on the cluster.
 
 2. [Single Node Models](scripts/sbatch_train_hf.sh) -
    runs a Hugging Face training pipeline across devices
-
    - creates a conda environment using the `env_creation_hf.sh` script.
    - runs the Hugging Face `run_tune.py` script with the user configuration
 
@@ -81,7 +90,6 @@ running on the cluster.
 
 3. [Multi Node Models](scripts/sbatch_train_hf_multinode.sh) -
    runs a Hugging Face training pipeline across nodes
-
    - creates a conda environment using the `env_creation_hf.sh` script.
    - runs the Hugging Face `run_tune.py` script with the user configuration
 
@@ -97,7 +105,6 @@ running on the cluster.
 
 4. [Grid Search](scripts/run_grid_search.py) -
    runs a grid search across training pipeline configuration options
-
    - Update the upper-case parameters at the top of the script
    - The script runs an exhaustive set of experiments across all permutations
 
diff --git a/experiments/ablations/continued_pretrain.py b/experiments/ablations/continued_pretrain.py
@@ -57,7 +57,13 @@ def load_model(
 
 
 def train(
-    model, tokenizer, dataset, run_name: str, batch_size: int = 64, max_seq_length=2048, eval_dataset=None
+    model,
+    tokenizer,
+    dataset,
+    run_name: str,
+    batch_size: int = 64,
+    max_seq_length=2048,
+    eval_dataset=None,
 ):
     wandb.init(project="chemnlp-ablations", name=run_name)
     trainer = UnslothTrainer(
@@ -83,8 +89,8 @@ def train(
             lr_scheduler_type="linear",
             seed=3407,
             output_dir=f"outputs_{run_name}",
-            eval_strategy = 'steps' if eval_dataset is not None else 'no',
-            eval_steps = 10_000 if eval_dataset is not None else None
+            eval_strategy="steps" if eval_dataset is not None else "no",
+            eval_steps=10_000 if eval_dataset is not None else None,
         ),
     )
 
@@ -138,9 +144,18 @@ def run(
     )
 
     dataset = create_dataset(tokenizer, data_files)
-    eval_dataset = create_dataset(tokenizer, eval_data_files) if eval_data_files else None
+    eval_dataset = (
+        create_dataset(tokenizer, eval_data_files) if eval_data_files else None
+    )
 
-    train(model, tokenizer, dataset, run_name, batch_size=batch_size, eval_dataset=eval_dataset)
+    train(
+        model,
+        tokenizer,
+        dataset,
+        run_name,
+        batch_size=batch_size,
+        eval_dataset=eval_dataset,
+    )
 
 
 if __name__ == "__main__":
diff --git a/experiments/configs/data_configs/hf_data.yml b/experiments/configs/data_configs/hf_data.yml
@@ -1,7 +1,7 @@
 model_name: "EleutherAI/pythia-1b"
 context_length: 2048
 dataset_name: "EleutherAI/pile"
-dataset_args: {"name": "pubmed", "split": "train"}
+dataset_args: { "name": "pubmed", "split": "train" }
 batch_size: 1
 string_key: "text"
 save_path: "/fsx/proj-chemnlp/data/example_tokenised"