Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions python/src/bioscript/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@
from .biovault import (
BioVaultPipeline,
BioVaultProject,
DatasetInput,
PipelineStep,
SQLStore,
TypeExpr,
create_bioscript_project,
export_bioscript_pipeline,
export_bioscript_workflow,
Expand Down Expand Up @@ -47,8 +49,10 @@
# BioVault integration
"BioVaultPipeline",
"BioVaultProject",
"DatasetInput",
"PipelineStep",
"SQLStore",
"TypeExpr",
"create_bioscript_project",
"export_bioscript_pipeline",
"export_bioscript_workflow",
Expand Down
81 changes: 81 additions & 0 deletions python/src/bioscript/biovault.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ class TypeExpr(str, Enum):
PARTICIPANT_SHEET = "ParticipantSheet"
GENOTYPE_RECORD = "GenotypeRecord"
BIOVAULT_CONTEXT = "BiovaultContext"
DATASET = "Dataset" # Remote dataset via beaver Twin
DATASET_TWIN = "DatasetTwin" # Twin object from dataset

@staticmethod
def list_of(inner: str) -> str:
Expand Down Expand Up @@ -128,6 +130,85 @@ def from_dict(cls, d: Dict[str, Any]) -> Input:
)


@dataclass
class DatasetInput:
    """A dataset input specification for loading remote datasets via beaver.

    The dataset is looked up as ``context.datasets[owner][dataset_name]`` and,
    when ``asset_key`` is set, narrowed to ``dataset[asset_key]``. According to
    the original author's notes the returned objects are beaver Twin objects
    intended for privacy-preserving analysis.

    Example:
        >>> dataset = DatasetInput(
        ...     name="patient_genomics",
        ...     owner="owner@example.org",
        ...     dataset_name="single_cell_rnaseq",
        ...     description="Patient single-cell RNA-seq data",
        ...     asset_key="sc_data",  # Optional: specific asset within dataset
        ... )
    """

    name: str  # Name of this input within the project/pipeline spec
    owner: str  # First-level key used to index ``context.datasets``
    dataset_name: str  # Second-level key, under the owner's datasets
    description: str  # Human-readable description
    asset_key: Optional[str] = None  # Specific asset within the dataset

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary for YAML serialization.

        Returns:
            A dict with ``name``, ``type`` (always ``"Dataset"``), ``owner``,
            ``dataset_name`` and ``description``. ``asset_key`` is included
            only when it is set to a non-empty value.
        """
        d: Dict[str, Any] = {
            "name": self.name,
            "type": "Dataset",
            "owner": self.owner,
            "dataset_name": self.dataset_name,
            "description": self.description,
        }
        # Truthiness check on purpose: both None and "" mean "no asset".
        if self.asset_key:
            d["asset_key"] = self.asset_key
        return d

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> DatasetInput:
        """Create an instance from a dictionary as produced by ``to_dict``.

        Args:
            d: Mapping with required keys ``name``, ``owner``,
                ``dataset_name`` and ``description``; ``asset_key`` is
                optional. A ``type`` key, if present, is ignored.

        Raises:
            KeyError: If any required key is missing.
        """
        return cls(
            name=d["name"],
            owner=d["owner"],
            dataset_name=d["dataset_name"],
            description=d["description"],
            asset_key=d.get("asset_key"),
        )

    def load(self, context: Any = None) -> Any:
        """Load the dataset, or one asset of it, from a beaver context.

        Args:
            context: Optional BeaverContext. If not provided (``None``), the
                active context returned by ``beaver.ctx()`` is used.

        Returns:
            ``dataset[asset_key]`` when ``asset_key`` is set, otherwise the
            dataset object itself.

        Raises:
            ImportError: If no context is given and the ``beaver`` package is
                not installed.

        Example:
            >>> ds_input = DatasetInput(...)
            >>> twin = ds_input.load()
            >>> # Use twin.mock for public mock data
            >>> # Use twin.private for actual private data (requires authorization)
        """
        if context is None:
            # beaver is only needed to discover the active context, so import
            # it lazily and keep the try body to the import alone: an
            # ImportError raised later (e.g. while a dataset is resolved) must
            # surface as-is instead of being reported as "beaver is missing".
            try:
                import beaver
            except ImportError:
                raise ImportError(
                    "beaver package is required for dataset inputs. "
                    "Install with: pip install biovault-beaver"
                ) from None
            context = beaver.ctx()

        dataset = context.datasets[self.owner][self.dataset_name]

        if self.asset_key:
            return dataset[self.asset_key]
        return dataset


@dataclass
class Output:
"""A project output specification."""
Expand Down
Loading