# Source code for quantem.data.browser

"""DataBrowser — Interactive Jupyter widget for browsing quantem.data datasets."""

import json
import pathlib

import anywidget
import numpy as np
import traitlets

from quantem.data.registry import available, info, load
from quantem.data.schema import VALID_TECHNIQUES


class DataBrowser(anywidget.AnyWidget):
    """Interactive browser for quantem.data datasets on HF Hub.

    Displays a filterable table of available datasets. Click a dataset to
    see its metadata, then click Load to download it into memory.

    Parameters
    ----------
    technique : str, optional
        Initial technique filter (e.g. ``"4dstem"``, ``"hrtem"``).

    Examples
    --------
    >>> from quantem.data import DataBrowser
    >>> browser = DataBrowser()
    >>> browser  # displays widget in notebook
    >>> # After selecting and loading a dataset in the UI:
    >>> browser.data.shape
    (256, 256)
    """

    # Front-end ES module rendered by anywidget.
    _esm = pathlib.Path(__file__).parent / "static" / "browser.js"

    # Python → JS: full catalog as JSON
    catalog_json = traitlets.Unicode("[]").tag(sync=True)
    # JS → Python: which dataset the user clicked
    selected_name = traitlets.Unicode("").tag(sync=True)
    # Python → JS: metadata for selected dataset
    selected_info_json = traitlets.Unicode("").tag(sync=True)
    # JS ↔ Python: technique filter
    technique_filter = traitlets.Unicode("").tag(sync=True)
    # Python → JS: loading indicator
    loading = traitlets.Bool(False).tag(sync=True)
    # Python → JS: name of loaded dataset
    loaded_name = traitlets.Unicode("").tag(sync=True)
    # JS → Python: trigger refresh (JS sets True; Python resets to False)
    _refresh_requested = traitlets.Bool(False).tag(sync=True)
    # JS → Python: trigger load (JS sets True; Python resets to False)
    _load_requested = traitlets.Bool(False).tag(sync=True)
[docs] def __init__(self, technique: str | None = None, **kwargs): super().__init__(**kwargs) self._data: np.ndarray | None = None self._metadata: dict | None = None if technique: self.technique_filter = technique self.observe(self._on_selected_name_change, names=["selected_name"]) self.observe(self._on_refresh_requested, names=["_refresh_requested"]) self.observe(self._on_load_requested, names=["_load_requested"]) self.refresh()
[docs] def refresh(self): """Re-fetch the dataset catalog from HF Hub.""" self.loading = True try: names = available() catalog = [] for name in names: try: meta = info(name) entry = { "name": name, "technique": meta.get("technique", ""), "shape": meta.get("data", {}).get("shape", []), "dtype": meta.get("data", {}).get("dtype", ""), "description": meta.get("description", ""), "size_mb": _estimate_size(meta), } catalog.append(entry) except Exception: catalog.append({ "name": name, "technique": "", "shape": [], "dtype": "", "description": "", "size_mb": 0, }) self.catalog_json = json.dumps(catalog) finally: self.loading = False
    def _on_selected_name_change(self, change):
        # Triggered by the JS side when the user clicks a table row.
        name = change["new"]
        if not name:
            # Selection cleared — hide the detail pane.
            self.selected_info_json = ""
            return
        try:
            meta = info(name)
            # Drop the internal file-path key before shipping JSON to JS.
            meta.pop("_npy_path", None)
            self.selected_info_json = json.dumps(meta)
        except Exception:
            # Best-effort: a failed metadata fetch just shows no detail.
            self.selected_info_json = ""

    def _on_refresh_requested(self, change):
        # JS sets the flag True; reset it first so the next click re-fires.
        if change["new"]:
            self._refresh_requested = False
            self.refresh()

    def _on_load_requested(self, change):
        # JS sets the flag True; reset it first so the next click re-fires.
        if change["new"]:
            self._load_requested = False
            name = self.selected_name
            if name:
                self._load_dataset(name)

    def _load_dataset(self, name: str):
        # Download the dataset and stash array + metadata on the instance.
        self.loading = True
        try:
            arr, meta = load(name, metadata=True)
            # Drop the internal file-path key; callers get clean metadata.
            meta.pop("_npy_path", None)
            self._data = arr
            self._metadata = meta
            self.loaded_name = name
        except Exception as e:
            # Clear any previously loaded state so .data/.metadata don't
            # silently refer to a different dataset than loaded_name.
            self._data = None
            self._metadata = None
            self.loaded_name = ""
            print(f"Failed to load {name}: {e}")
        finally:
            self.loading = False

    @property
    def data(self) -> np.ndarray | None:
        """The loaded dataset as a NumPy array, or None if nothing is loaded."""
        return self._data

    @property
    def metadata(self) -> dict | None:
        """Metadata dict for the loaded dataset, or None."""
        return self._metadata

    @property
    def techniques(self) -> list[str]:
        """Valid technique names."""
        return list(VALID_TECHNIQUES)
[docs] def summary(self): """Print a human-readable summary of the browser state.""" catalog = json.loads(self.catalog_json) print(f"DataBrowser") print(f" Datasets: {len(catalog)}") if self.technique_filter: filtered = [d for d in catalog if d["technique"] == self.technique_filter] print(f" Filter: {self.technique_filter} ({len(filtered)} shown)") if self.loaded_name: print(f" Loaded: {self.loaded_name}") if self._data is not None: print(f" Shape: {self._data.shape}") print(f" Dtype: {self._data.dtype}")
[docs] def __repr__(self) -> str: catalog = json.loads(self.catalog_json) parts = [f"DataBrowser({len(catalog)} datasets"] if self.technique_filter: parts.append(f"filter={self.technique_filter!r}") if self.loaded_name: parts.append(f"loaded={self.loaded_name!r}") return ", ".join(parts) + ")"
def _estimate_size(meta: dict) -> float: """Estimate dataset size in MB from metadata shape + dtype.""" data = meta.get("data", {}) shape = data.get("shape", []) dtype = data.get("dtype", "float32") if not shape: return 0 try: nbytes = np.prod(shape) * np.dtype(dtype).itemsize return round(nbytes / (1024 * 1024), 2) except Exception: return 0