diff --git a/.github/workflows/publish-pypi.yml b/.github/workflows/publish-pypi.yml index 1760f1d..405fee0 100644 --- a/.github/workflows/publish-pypi.yml +++ b/.github/workflows/publish-pypi.yml @@ -1,6 +1,3 @@ -# This workflow will install Python dependencies, run tests and lint with a single version of Python -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions - name: Publish to PyPI on: @@ -19,10 +16,10 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Set up Python 3.11 + - name: Set up Python 3.12 uses: actions/setup-python@v5 with: - python-version: 3.11 + python-version: "3.12" - name: Install dependencies run: | @@ -33,6 +30,14 @@ jobs: run: | tox + - name: Build Project and Publish + run: | + python -m tox -e clean,build + + # This uses the trusted publisher workflow so no token is required. + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + - name: Build docs run: | tox -e docs @@ -45,11 +50,3 @@ jobs: branch: gh-pages # The branch the action should deploy to. folder: ./docs/_build/html clean: true # Automatically remove deleted files from the deploy branch - - - name: Build Project and Publish - run: | - python -m tox -e clean,build - - # This uses the trusted publisher workflow so no token is required. 
- - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 90aa16a..e8ab6fa 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -1,33 +1,73 @@ -name: Run tests +name: Test the library on: push: - branches: [master] + branches: + - master # for legacy repos + - main pull_request: - branches: [master] + branches: + - master # for legacy repos + - main + workflow_dispatch: # Allow manually triggering the workflow + schedule: + # Run roughly every 15 days at 00:00 UTC + # (useful to check if updates on dependencies break the package) + - cron: "0 0 1,16 * *" + +permissions: + contents: read + +concurrency: + group: >- + ${{ github.workflow }}-${{ github.ref_type }}- + ${{ github.event.pull_request.number || github.sha }} + cancel-in-progress: true jobs: - build: - runs-on: ubuntu-latest + test: strategy: matrix: - python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] - - name: Python ${{ matrix.python-version }} + python: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] + platform: + - ubuntu-latest + - macos-latest + - windows-latest + runs-on: ${{ matrix.platform }} + name: Python ${{ matrix.python }}, ${{ matrix.platform }} steps: - uses: actions/checkout@v4 - - name: Setup Python - uses: actions/setup-python@v5 + - uses: actions/setup-python@v5 + id: setup-python with: - python-version: ${{ matrix.python-version }} - cache: "pip" + python-version: ${{ matrix.python }} - name: Install dependencies run: | python -m pip install --upgrade pip - pip install tox + pip install tox coverage - - name: Test with tox - run: | + - name: Run tests + run: >- + pipx run --python '${{ steps.setup-python.outputs.python-path }}' tox + -- -rFEx --durations 10 --color yes --cov --cov-branch --cov-report=xml # pytest args + + - name: Check for codecov token availability + id: codecov-check + shell: bash + run: | + if [ "${{ secrets.CODECOV_TOKEN }}" != '' 
]; then + echo "codecov=true" >> $GITHUB_OUTPUT; + else + echo "codecov=false" >> $GITHUB_OUTPUT; + fi + + - name: Upload coverage reports to Codecov with GitHub Action + uses: codecov/codecov-action@v5 + if: ${{ steps.codecov-check.outputs.codecov == 'true' }} + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + slug: ${{ github.repository }} + flags: ${{ matrix.platform }} - py${{ matrix.python }} diff --git a/CHANGELOG.md b/CHANGELOG.md index ac3d912..008c822 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Changelog +## Version 0.7.0 + +- Major update to type hints throughout the module for better type safety and consistency. +- Fixed bug in slice operations where column indices might be incorrectly initialized. +- Added missing index validation in `get_row()` for integer row indices. Added similar index validation to `remove_columns()` and `remove_rows()` for out-of-range indices. +- Accept a list of column values and column names to initialize a `BiocFrame` object. +- Implement `empty`, `__contains__`, `head()`, and `tail()`. +- Coercions to list and `NamedList` from biocutils. + ## Version 0.6.3 - Implement `remove_rows()`. - Implement `has_row()`. 
diff --git a/README.md b/README.md index faecc27..60d9bba 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,106 @@ pip install biocframe pip install biocframe[optional] ``` +## Quick Examples + +### Genomic Annotation Data + +Genomic data often requires storing coordinates, annotations, and metadata together: + +```python +# Gene annotation with nested structures +gene_annotations = BiocFrame({ + "gene_id": ["GENE1", "GENE2", "GENE3"], + "symbol": ["BRCA1", "TP53", "EGFR"], + "location": BiocFrame({ + "chromosome": ["chr17", "chr17", "chr7"], + "start": [43044295, 7668422, 55019017], + "end": [43125483, 7687550, 55211628], + "strand": ["-", "-", "+"], + }), + "transcripts": [ + ["NM_007294", "NM_007297", "NM_007300"], + ["NM_000546"], + ["NM_005228", "NM_201282"], + ], + "pathways": [ + ["DNA repair", "Cell cycle"], + ["Apoptosis", "Cell cycle", "DNA repair"], + ["Cell growth", "Signal transduction"], + ], +}, row_names=["ENSG00000012048", "ENSG00000141510", "ENSG00000146648"]) + +print(gene_annotations) +``` + +### Multi-Omics Data Integration + +When combining different types of omics data with varying structures: + +```python +# Multi-omics data with different measurement types +multi_omics = BiocFrame({ + "sample_id": ["S1", "S2", "S3"], + "rna_seq": np.array([ + [100, 200, 150], + [300, 250, 180], + [120, 220, 160], + ], dtype=np.float32), + "methylation": BiocFrame({ + "cg0001": [0.85, 0.92, 0.78], + "cg0002": [0.45, 0.38, 0.52], + "cg0003": [0.12, 0.15, 0.10], + }), + "clinical": BiocFrame({ + "age": [45, 52, 38], + "gender": ["M", "F", "F"], + "diagnosis": ["Type A", "Type B", "Type A"], + }), +}, column_data=BiocFrame({ + "data_type": ["identifier", "expression", "epigenetic", "clinical"], + "source": ["lab", "sequencer", "array", "EHR"], +})) + +print(multi_omics) +print("\nColumn metadata:") +print(multi_omics.get_column_data()) +``` + +### Hierarchical Data Structures + +For data with natural hierarchies (e.g., samples → patients → cohorts): + 
+```python +# Hierarchical clinical trial data +clinical_trial = BiocFrame({ + "patient_id": ["P001", "P002", "P003"], + "cohort": ["A", "A", "B"], + "samples": [ + BiocFrame({ + "sample_id": ["S001", "S002"], + "collection_date": ["2024-01-01", "2024-01-15"], + "vital_status": ["alive", "alive"], + }), + BiocFrame({ + "sample_id": ["S003", "S004", "S005"], + "collection_date": ["2024-01-02", "2024-01-16", "2024-01-30"], + "vital_status": ["alive", "alive", "deceased"], + }), + BiocFrame({ + "sample_id": ["S006"], + "collection_date": ["2024-01-03"], + "vital_status": ["alive"], + }), + ], +}, metadata={ + "trial_name": "PHASE_III_STUDY", + "start_date": "2024-01-01", + "status": "ongoing", +}) + +print(clinical_trial) +``` + ## Construction To construct a `BiocFrame` object, simply provide the data as a dictionary. diff --git a/docs/tutorial.md b/docs/tutorial.md index 5e56342..75ed4ed 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -14,6 +14,81 @@ This flexibility allows us to accept arbitrarily complex objects as columns, whi These classes follow a functional paradigm for accessing or setting properties, with further details discussed in [functional paradigm](https://biocpy.github.io/tutorial/chapters/philosophy.html) section. ::: +# When to Use `BiocFrame` + +`BiocFrame` is particularly well-suited for the following scenarios: + +## 1. **Bioconductor Interoperability** + +When working with R's Bioconductor ecosystem, `BiocFrame` provides seamless data exchange without type coercion issues that can occur with pandas. 
+ +```{code-cell} +from biocframe import BiocFrame +import numpy as np + +# Preserve exact types for R interoperability +gene_data = BiocFrame({ + "gene_id": ["ENSG00000139618", "ENSG00000157764"], + "expression": np.array([2.5, 3.1], dtype=np.float32), + "p_value": np.array([0.001, 0.003], dtype=np.float64), +}) + +# Types are preserved exactly as provided +print(type(gene_data["expression"])) # <class 'numpy.ndarray'> +print(gene_data["expression"].dtype) # float32 +``` + +## 2. **Nested and Complex Data Structures** + +When your data contains nested structures, lists of varying lengths, or other complex objects that don't fit into traditional tabular formats. + +```{code-cell} +# Genomic ranges with nested information +genomic_data = BiocFrame({ + "gene_id": ["GENE1", "GENE2", "GENE3"], + "coordinates": BiocFrame({ + "chr": ["chr1", "chr2", "chr1"], + "start": [1000, 2000, 3000], + "end": [1500, 2500, 3500], + }), + "transcripts": [ + ["T1", "T2"], # GENE1 has 2 transcripts + ["T3"], # GENE2 has 1 transcript + ["T4", "T5", "T6"], # GENE3 has 3 transcripts + ], + "metadata": [ + {"source": "Ensembl", "version": 109}, + {"source": "NCBI", "version": 38}, + {"source": "Ensembl", "version": 109}, + ], +}) + +print(genomic_data) +``` + +## 3. **Functional Programming Style** + +When you prefer immutable operations that don't modify data in-place, making your code more predictable and easier to debug. + +```{code-cell} +# Chain operations without side effects +original = BiocFrame({ + "A": [1, 2, 3], + "B": [4, 5, 6], +}) + +# All operations return new objects +modified = (original + .set_column_names(["X", "Y"]) + .set_row_names(["row1", "row2", "row3"]) + .set_metadata({"source": "example"}) +) + +# Original is unchanged +print("Original column names:", original.column_names) +print("Modified column names:", modified.column_names) +``` + # Advantages of `BiocFrame` One of the core principles guiding the implementation of the `BiocFrame` class is "**_what you put is what you get_**". 
Unlike Pandas `DataFrame`, `BiocFrame` makes no assumptions about the types of the columns provided as input. Some key differences to highlight the advantages of using `BiocFrame` are especially in terms of modifications to column types and handling nested dataframes. @@ -146,6 +221,106 @@ The `row_names` parameter is analogous to index in the pandas world and should n - `metadata`: Additional metadata about the object, usually a dictionary. - `column_names`: If different from the keys in the `data`. If not provided, this is automatically extracted from the keys in the `data`. +# Example Use Cases + +## Use Case 1: Genomic Annotation Data + +Genomic data often requires storing coordinates, annotations, and metadata together: + +```{code-cell} +# Gene annotation with nested structures +gene_annotations = BiocFrame({ + "gene_id": ["GENE1", "GENE2", "GENE3"], + "symbol": ["BRCA1", "TP53", "EGFR"], + "location": BiocFrame({ + "chromosome": ["chr17", "chr17", "chr7"], + "start": [43044295, 7668422, 55019017], + "end": [43125483, 7687550, 55211628], + "strand": ["-", "-", "+"], + }), + "transcripts": [ + ["NM_007294", "NM_007297", "NM_007300"], + ["NM_000546"], + ["NM_005228", "NM_201282"], + ], + "pathways": [ + ["DNA repair", "Cell cycle"], + ["Apoptosis", "Cell cycle", "DNA repair"], + ["Cell growth", "Signal transduction"], + ], +}, row_names=["ENSG00000012048", "ENSG00000141510", "ENSG00000146648"]) + +print(gene_annotations) +``` + +## Use Case 2: Multi-Omics Data Integration + +When combining different types of omics data with varying structures: + +```{code-cell} +# Multi-omics data with different measurement types +multi_omics = BiocFrame({ + "sample_id": ["S1", "S2", "S3"], + "rna_seq": np.array([ + [100, 200, 150], + [300, 250, 180], + [120, 220, 160], + ], dtype=np.float32), + "methylation": BiocFrame({ + "cg0001": [0.85, 0.92, 0.78], + "cg0002": [0.45, 0.38, 0.52], + "cg0003": [0.12, 0.15, 0.10], + }), + "clinical": BiocFrame({ + "age": [45, 52, 38], + 
"gender": ["M", "F", "F"], + "diagnosis": ["Type A", "Type B", "Type A"], + }), +}, column_data=BiocFrame({ + "data_type": ["identifier", "expression", "epigenetic", "clinical"], + "source": ["lab", "sequencer", "array", "EHR"], +})) + +print(multi_omics) +print("\nColumn metadata:") +print(multi_omics.get_column_data()) +``` + +## Use Case 3: Hierarchical Data Structures + +For data with natural hierarchies (e.g., samples → patients → cohorts): + +```{code-cell} +# Hierarchical clinical trial data +clinical_trial = BiocFrame({ + "patient_id": ["P001", "P002", "P003"], + "cohort": ["A", "A", "B"], + "samples": [ + BiocFrame({ + "sample_id": ["S001", "S002"], + "collection_date": ["2024-01-01", "2024-01-15"], + "vital_status": ["alive", "alive"], + }), + BiocFrame({ + "sample_id": ["S003", "S004", "S005"], + "collection_date": ["2024-01-02", "2024-01-16", "2024-01-30"], + "vital_status": ["alive", "alive", "deceased"], + }), + BiocFrame({ + "sample_id": ["S006"], + "collection_date": ["2024-01-03"], + "vital_status": ["alive"], + }), + ], +}, metadata={ + "trial_name": "PHASE_III_STUDY", + "start_date": "2024-01-01", + "status": "ongoing", +}) + +print(clinical_trial) +``` + # With other `DataFrame` libraries # Pandas diff --git a/setup.cfg b/setup.cfg index 3ddc92b..733ece1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -49,7 +49,7 @@ python_requires = >=3.9 # For more information, check out https://semver.org/. 
install_requires = importlib-metadata; python_version<"3.8" - biocutils>=0.1.4 + biocutils>=0.3.1.dev0 numpy [options.packages.find] diff --git a/src/biocframe/__init__.py b/src/biocframe/__init__.py index 6086c6a..ac15730 100644 --- a/src/biocframe/__init__.py +++ b/src/biocframe/__init__.py @@ -15,5 +15,5 @@ finally: del version, PackageNotFoundError -from .BiocFrame import BiocFrame, relaxed_combine_rows, merge, relaxed_combine_columns +from .frame import BiocFrame, relaxed_combine_rows, merge, relaxed_combine_columns from .io import from_pandas diff --git a/src/biocframe/BiocFrame.py b/src/biocframe/frame.py similarity index 79% rename from src/biocframe/BiocFrame.py rename to src/biocframe/frame.py index 8f1833f..c93e09f 100644 --- a/src/biocframe/BiocFrame.py +++ b/src/biocframe/frame.py @@ -1,11 +1,17 @@ +from __future__ import annotations + from collections import OrderedDict, abc from copy import copy -from typing import Any, Dict, List, Literal, Mapping, Optional, Sequence, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Sequence, Tuple, Union from warnings import warn import biocutils as ut import numpy +if TYPE_CHECKING: + import pandas + import polars + __author__ = "Jayaram Kancherla, Aaron Lun, Kevin Yang" __copyright__ = "jkanche" __license__ = "MIT" @@ -17,8 +23,8 @@ def _guess_number_of_rows( number_of_rows: Optional[int], data: Dict[str, Any], - row_names: Optional[List[str]], -): + row_names: Optional[Union[Sequence[str], ut.Names]], +) -> int: if number_of_rows is not None: return number_of_rows if len(data) > 0: @@ -31,11 +37,8 @@ def _guess_number_of_rows( def _validate_rows( number_of_rows: int, data: Dict[str, Any], - row_names: Optional[List[str]], -) -> int: - if not isinstance(data, dict): - raise TypeError("`data` must be a dictionary.") - + row_names: Optional[Union[Sequence[str], ut.Names]], +) -> None: incorrect_len_keys = [] for k, v in data.items(): if number_of_rows != ut.get_height(v): @@ 
-56,10 +59,10 @@ def _validate_rows( def _validate_columns( - column_names: List[str], + column_names: ut.Names, data: Dict[str, Any], - column_data: Optional["BiocFrame"], -) -> Tuple[List[str], Dict[str, Any]]: + column_data: Optional[BiocFrame], +) -> None: if sorted(column_names) != sorted(data.keys()): raise ValueError("Mismatch between `column_names` and the keys of `data`.") @@ -72,25 +75,22 @@ def _validate_columns( class BiocFrameIter: - """An iterator to a :py:class:`~biocframe.BiocFrame.BiocFrame` object. - - Args: - obj (BiocFrame): Source object to iterate. - """ + """An iterator to a :py:class:`~biocframe.frame.BiocFrame` object.""" - def __init__(self, obj: "BiocFrame") -> None: + def __init__(self, obj: BiocFrame) -> None: """Initialize the iterator. Args: - obj (BiocFrame): source object to iterate. + obj: + Source object to iterate. """ self._bframe = obj self._current_index = 0 - def __iter__(self): + def __iter__(self) -> BiocFrameIter: return self - def __next__(self): + def __next__(self) -> Tuple[Optional[Union[ut.Names, str]], Dict[str, Any]]: if self._current_index < len(self._bframe): iter_row_index = self._bframe.row_names[self._current_index] if self._bframe.row_names is not None else None @@ -121,13 +121,13 @@ class BiocFrame: def __init__( self, - data: Mapping = None, + data: Optional[Union[Dict[str, Any], ut.NamedList, Sequence[Any]]] = None, number_of_rows: Optional[int] = None, - row_names: Optional[List] = None, - column_names: Optional[List[str]] = None, - column_data: Optional["BiocFrame"] = None, - metadata: Optional[dict] = None, - validate: bool = True, + row_names: Optional[Union[Sequence[str], ut.Names]] = None, + column_names: Optional[Union[Sequence[str], ut.Names]] = None, + column_data: Optional[BiocFrame] = None, + metadata: Optional[Union[Dict[str, Any], ut.NamedList]] = None, + _validate: bool = True, ) -> None: """Initialize a ``BiocFrame`` object from columns. 
@@ -141,6 +141,10 @@ def __init__( a :py:class:`~biocutils.NamedList` that can be coerced into a dictionary. + Alternatively, a sequence of columns may be provided. In this case, + ``column_names`` must be provided and must have the same length + as the sequence. + number_of_rows: Number of rows. If not specified, inferred from ``data``. This needs to be provided if ``data`` is empty and ``row_names`` are @@ -161,7 +165,7 @@ def __init__( metadata: Additional metadata. Defaults to an empty dictionary. - validate: + _validate: Internal use only. """ if data is None: @@ -174,11 +178,21 @@ def __init__( if not isinstance(v, list): # if its a scalar, make a list else corce to list data[k] = list(v) if isinstance(v, abc.Sequence) else [v] + elif isinstance(data, Sequence) and not isinstance(data, (str, dict)): + if column_names is None: + raise ValueError("`column_names` must be provided if `data` is a sequence.") + + if len(data) != len(column_names): + raise ValueError("Length of `data` and `column_names` must match.") + + data = dict(zip(column_names, data)) self._data = data + if row_names is not None and not isinstance(row_names, ut.Names): row_names = ut.Names(row_names) self._row_names = row_names + self._number_of_rows = int( _guess_number_of_rows( number_of_rows, @@ -188,24 +202,70 @@ def __init__( ) if column_names is None: - column_names = ut.Names(self._data.keys()) - elif not isinstance(column_names, ut.Names): - column_names = ut.Names(column_names) - self._column_names = column_names + self._column_names = ut.Names(self._data.keys()) + else: + self._column_names = column_names if isinstance(column_names, ut.Names) else ut.Names(column_names) self._metadata = {} if metadata is None else metadata self._column_data = column_data - if validate: + if _validate: _validate_rows(self._number_of_rows, self._data, self._row_names) _validate_columns(self._column_names, self._data, self._column_data) - def _define_output(self, in_place: bool = False) -> "BiocFrame": 
+ def _define_output(self, in_place: bool = False) -> BiocFrame: if in_place is True: return self else: return self.__copy__() + def __eq__(self, other: Any) -> bool: + """Check if the current object is equal to another. + + Args: + other: + Object to compare with. + + Returns: + True if the objects are equal, False otherwise. + """ + if not isinstance(other, BiocFrame): + return False + + if self.shape != other.shape: + return False + + if self.row_names != other.row_names: + return False + + if self.column_names != other.column_names: + return False + + for col in self.column_names: + d1 = self.column(col) + d2 = other.column(col) + + if isinstance(d1, numpy.ndarray) or isinstance(d2, numpy.ndarray): + if not numpy.array_equal(d1, d2): + return False + else: + try: + if d1 != d2: + return False + except Exception: + # Fallback for other array-like objects (e.g. pandas Series) + # where bool(d1 == d2) is ambiguous. + if not numpy.array_equal(d1, d2): + return False + + if self.metadata != other.metadata: + return False + + if self.column_data != other.column_data: + return False + + return True + ################################# ######>> Shape and stuff <<###### ################################# @@ -225,10 +285,39 @@ def __len__(self) -> int: """ return self.shape[0] + @property + def empty(self) -> bool: + """Check if the object is empty. + + Returns: + True if the object has no rows, False otherwise. + """ + return self.is_empty() + + def is_empty(self) -> bool: + """Check if the object is empty. + + Returns: + True if the object has no rows, False otherwise. + """ + return self.shape[0] == 0 + def __iter__(self) -> BiocFrameIter: """Iterator over rows.""" return BiocFrameIter(self) + def __contains__(self, name: str) -> bool: + """Check if a column exists. + + Args: + name: + Name of the column. + + Returns: + True if the column exists, False otherwise. 
+ """ + return self.has_column(name) + @property def dims(self) -> Tuple[int, int]: """Alias for :py:attr:`~shape`.""" @@ -332,14 +421,16 @@ def __str__(self) -> str: ########################### def get_row_names(self) -> Optional[ut.Names]: - """ + """Get row names. + Returns: List of row names, or None if no row names are available. """ return self._row_names - def set_row_names(self, names: Optional[List], in_place: bool = False) -> "BiocFrame": - """ + def set_row_names(self, names: Optional[Union[Sequence[str], ut.Names]], in_place: bool = False) -> BiocFrame: + """Set new row names. + Args: names: List of strings. This should have length equal to the @@ -373,7 +464,7 @@ def row_names(self) -> Optional[ut.Names]: return self.get_row_names() @row_names.setter - def row_names(self, names: Optional[List]): + def row_names(self, names: Optional[Union[Sequence[str], ut.Names]]) -> None: """Alias for :py:attr:`~set_row_names` with ``in_place = True``. As this mutates the original object, a warning is raised. @@ -390,24 +481,41 @@ def rownames(self) -> Optional[ut.Names]: return self.get_row_names() @rownames.setter - def rownames(self, names: list): + def rownames(self, names: Optional[Union[Sequence[str], ut.Names]]) -> None: """Alias for :py:attr:`~set_row_names` with ``in_place = True``, provided for back-compaibility only. As this mutates the original object, a warning is raised. """ - return self.set_row_names(names, in_place=True) + self.set_row_names(names, in_place=True) ###################### ######>> Data <<###### ###################### def get_data(self) -> Dict[str, Any]: - """ + """Get the underlying data. + Returns: Dictionary of columns and their values. """ return self._data + def to_dict(self) -> Dict[str, Any]: + """Alias for :py:meth:`~get_data`. + + Returns: + Dictionary of columns and their values. + """ + return self.get_data() + + def to_NamedList(self) -> ut.NamedList: + """Convert the ``BiocFrame`` to a :py:class:`~biocutils.NamedList`. 
+ + Returns: + A ``NamedList`` containing the columns. + """ + return ut.NamedList([self._data[c] for c in self._column_names], names=self._column_names) + @property def data(self) -> Dict[str, Any]: """Alias for :py:attr:`~get_data`.""" @@ -418,14 +526,24 @@ def data(self) -> Dict[str, Any]: ############################## def get_column_names(self) -> ut.Names: - """ + """Get column names. + Returns: A list of column names. """ return self._column_names - def set_column_names(self, names: List[str], in_place: bool = False) -> "BiocFrame": + def get_columns(self) -> List[Any]: + """Get all columns as a list. + + Returns: + A list containing the data for each column. """ + return [self._data[c] for c in self._column_names] + + def set_column_names(self, names: Union[Sequence[str], ut.Names], in_place: bool = False) -> BiocFrame: + """Set new column names. + Args: names: List of unique strings, of length equal to the number of @@ -463,7 +581,7 @@ def column_names(self) -> ut.Names: return self.get_column_names() @column_names.setter - def column_names(self, names: List[str]): + def column_names(self, names: Union[Sequence[str], ut.Names]) -> None: """Alias for :py:attr:`~set_column_names` with ``in_place = True``. As this mutates the original object, a warning is raised. @@ -480,7 +598,7 @@ def colnames(self) -> ut.Names: return self.get_column_names() @colnames.setter - def colnames(self, names: ut.Names): + def colnames(self, names: Union[Sequence[str], ut.Names]) -> None: """Alias for :py:attr:`~set_column_names` with ``in_place = True``, provided for back-compatibility only. As this mutates the original object, a warning is raised. @@ -491,8 +609,9 @@ def colnames(self, names: ut.Names): ######>> Metadata <<###### ########################## - def get_column_data(self, with_names: bool = True) -> Union[None, "BiocFrame"]: - """ + def get_column_data(self, with_names: bool = True) -> Optional[BiocFrame]: + """Get column data. 
+ Args: with_names: Whether to set the column names of this ``BiocFrame`` as @@ -508,12 +627,13 @@ def get_column_data(self, with_names: bool = True) -> Union[None, "BiocFrame"]: output = output.set_row_names(self._column_names) return output - def set_column_data(self, column_data: Union[None, "BiocFrame"], in_place: bool = False) -> "BiocFrame": - """ + def set_column_data(self, column_data: Optional[BiocFrame], in_place: bool = False) -> BiocFrame: + """Set new column data. + Args: column_data: New column data. This should either be a ``BiocFrame`` with the - numbero of rows equal to the number of columns in the current object, + number of rows equal to the number of columns in the current object, or None to remove existing column data. in_place: @@ -532,12 +652,12 @@ def set_column_data(self, column_data: Union[None, "BiocFrame"], in_place: bool return output @property - def column_data(self) -> Union[None, "BiocFrame"]: + def column_data(self) -> Optional[BiocFrame]: """Alias for :py:attr:`~get_column_data`.""" return self.get_column_data() @column_data.setter - def column_data(self, column_data: Union[None, "BiocFrame"]): + def column_data(self, column_data: Optional[BiocFrame]) -> None: """Alias for :py:attr:`~set_column_data` with ``in_place = True``. As this mutates the original object, a warning is raised. @@ -549,14 +669,16 @@ def column_data(self, column_data: Union[None, "BiocFrame"]): self.set_column_data(column_data, in_place=True) def get_metadata(self) -> dict: - """ + """Get the metadata. + Returns: Dictionary of metadata for this object. """ return self._metadata - def set_metadata(self, metadata: dict, in_place: bool = False) -> "BiocFrame": - """ + def set_metadata(self, metadata: Dict[str, Any], in_place: bool = False) -> BiocFrame: + """Set new metadata. + Args: metadata: New metadata for this object. 
@@ -575,12 +697,12 @@ def set_metadata(self, metadata: dict, in_place: bool = False) -> "BiocFrame": return output @property - def metadata(self) -> dict: + def metadata(self) -> Dict[str, Any]: """Alias for :py:attr:`~get_metadata`.""" return self.get_metadata() @metadata.setter - def metadata(self, metadata: dict): + def metadata(self, metadata: Dict[str, Any]) -> None: """Alias for :py:attr:`~set_metadata` with ``in_place = True``. As this mutates the original object, a warning is raised. @@ -596,17 +718,20 @@ def metadata(self, metadata: dict): ################################ def has_column(self, name: str) -> bool: - """ + """Whether a column with the specified ``name`` exists in this object. + Args: - name: Name of the column. + name: + Name of the column. Returns: - Whether a column with the specified ``name`` exists in this object. + True if the column exists, Otherwise False. """ return name in self.column_names def get_column(self, column: Union[str, int]) -> Any: - """ + """Get the contents of the specified column. + Args: column: Name of the column, which must exist in :py:attr:`~get_column_names`. @@ -620,8 +745,8 @@ def get_column(self, column: Union[str, int]) -> Any: if column < 0: raise IndexError("Index cannot be negative.") - if column > len(self._column_names): - raise IndexError("Index greater than the number of columns.") + if column >= len(self._column_names): + raise IndexError(f"Index {column} is out of range for {len(self._column_names)} columns.") return self._data[self._column_names[column]] elif isinstance(column, str): @@ -641,12 +766,14 @@ def column(self, column: Union[str, int]) -> Any: return self.get_column(column) def has_row(self, name: str) -> bool: - """ + """Whether a row with the specified ``name`` exists in this object. + Args: - name: Name of the row. + name: + Name of the row. Returns: - Whether a row with the specified ``name`` exists in this object. + True if the row exists, Otherwise False. 
""" if self.row_names is None: warn("No row names are defined", UserWarning) @@ -654,8 +781,9 @@ def has_row(self, name: str) -> bool: return name in self.row_names - def get_row(self, row: Union[str, int]) -> dict: - """ + def get_row(self, row: Union[str, int]) -> Dict[str, Any]: + """Get a specified row. + Args: row: Integer index of the row to access. @@ -671,18 +799,31 @@ def get_row(self, row: Union[str, int]) -> dict: if isinstance(row, str): if self._row_names is None: raise ValueError("No row names present to find row '" + row + "'.") - row = self._row_names.index(row) - if row < 0: + + row_idx = self._row_names.map(row) + if row_idx < 0: raise ValueError("Could not find row '" + row + "'.") - elif not isinstance(row, int): - raise TypeError("`row` must be either an integer index or row name.") + row = row_idx + else: + # must be an int + try: + if not isinstance(row, int): + row = int(row) # incase its a numpy int + except Exception: + raise TypeError("`row` must be either an integer index or row name.") + + if row < 0: + raise IndexError("Row index cannot be negative.") + + if row >= self.shape[0]: + raise IndexError(f"Row index {row} is out of range for {self.shape[0]} rows.") collected = {} for col in self._column_names: collected[col] = self._data[col][row] return collected - def row(self, row: Union[str, int]) -> dict: + def row(self, row: Union[str, int]) -> Dict[str, Any]: """Alias for :py:attr:`~get_row`, provided for back-compatibility only.""" warn( "Method 'row' is deprecated, use 'get_row' instead", @@ -694,11 +835,45 @@ def row(self, row: Union[str, int]) -> dict: ######>> Slicers <<###### ######################### + def head(self, n: int = 5) -> BiocFrame: + """Get the first `n` rows. + + Args: + n: + Number of rows to return. + + Returns: + A new ``BiocFrame`` object with the first `n` rows. 
+ """ + if n < 0: + raise ValueError("n must be non-negative.") + + return self[:n, :] + + def tail(self, n: int = 5) -> BiocFrame: + """Get the last `n` rows. + + Args: + n: + Number of rows to return. + + Returns: + A new ``BiocFrame`` object with the last `n` rows. + """ + if n < 0: + raise ValueError("n must be non-negative.") + + rows = self.shape[0] + if n > rows: + n = rows + + return self[rows - n : rows, :] + def get_slice( self, - rows: Union[str, int, bool, Sequence], - columns: Union[str, int, bool, Sequence], - ) -> "BiocFrame": + rows: Union[str, int, bool, Sequence[Union[str, int, bool]], slice], + columns: Union[str, int, bool, Sequence[Union[str, int, bool]], slice], + ) -> BiocFrame: """Slice ``BiocFrame`` along the rows and/or columns, based on their indices or names. Args: @@ -725,6 +900,8 @@ def get_slice( if not (isinstance(columns, slice) and columns == slice(None)): new_column_indices, _ = ut.normalize_subscript(columns, len(new_column_names), new_column_names) new_column_names = ut.subset_sequence(new_column_names, new_column_indices) + else: + new_column_indices = slice(None) new_data = {} for col in new_column_names: @@ -755,10 +932,14 @@ def get_slice( column_names=new_column_names, metadata=self._metadata, column_data=column_data, - validate=False, + _validate=False, ) - def slice(self, rows: Sequence, columns: Sequence) -> "BiocFrame": + def slice( + self, + rows: Optional[Union[Sequence[Union[str, int, bool]], slice]], + columns: Optional[Union[Sequence[Union[str, int, bool]], slice]], + ) -> BiocFrame: """Alias for :py:attr:`~__getitem__`, for back-compatibility.""" if rows is None: rows = slice(None) @@ -766,7 +947,10 @@ def slice(self, rows: Sequence, columns: Sequence) -> "BiocFrame": columns = slice(None) return self.__getitem__((rows, columns)) - def __getitem__(self, args: Union[int, str, Sequence, tuple]) -> Union["BiocFrame", Any]: + def __getitem__( + self, + args: Union[int, str, Sequence[Union[str, int]], Tuple[Union[int, 
str, Sequence[Union[str, int]], slice], ...]], + ) -> Union[BiocFrame, Any]: """Wrapper around :py:attr:`~get_column` and :py:attr:`~get_slice` to obtain a slice of a ``BiocFrame`` or any of its columns. @@ -814,7 +998,11 @@ def __getitem__(self, args: Union[int, str, Sequence, tuple]) -> Union["BiocFram return self.get_slice(slice(None), args) - def __setitem__(self, args: Union[int, str, Sequence, tuple], value: "BiocFrame"): + def __setitem__( + self, + args: Union[int, str, Sequence[Union[str, int]], Tuple[Union[int, str, Sequence[Union[str, int]], slice], ...]], + value: Union[BiocFrame, Any], + ) -> None: """Wrapper around :py:attr:`~set_column` and :py:attr:`~set_slice` to modify a slice of a ``BiocFrame`` or any of its columns. As this modified the original object in place, a warning is raise. @@ -841,11 +1029,11 @@ def __setitem__(self, args: Union[int, str, Sequence, tuple], value: "BiocFrame" def set_slice( self, - rows: Union[int, str, bool, Sequence], - columns: Union[int, str, bool, Sequence], - value: "BiocFrame", + rows: Union[int, str, bool, Sequence[Union[int, str, bool]], slice], + columns: Union[int, str, bool, Sequence[Union[int, str, bool]], slice], + value: BiocFrame, in_place: bool = True, - ) -> "BiocFrame": + ) -> BiocFrame: """Replace a slice of the ``BiocFrame`` given the row and columns of the slice. Args: @@ -900,7 +1088,7 @@ def set_slice( ######>> Item setters <<###### ############################## - def __delitem__(self, name: str): + def __delitem__(self, name: str) -> None: """Alias for :py:attr:`~remove_column` with ``in_place = True``. As this mutates the original object, a warning is raised. 
@@ -911,7 +1099,7 @@ def __delitem__(self, name: str): ) self.remove_column(name, in_place=True) - def set_column(self, column: Union[int, str], value: Any, in_place: bool = False) -> "BiocFrame": + def set_column(self, column: Union[int, str], value: Any, in_place: bool = False) -> BiocFrame: """Modify an existing column or add a new column. This is a convenience wrapper around :py:attr:`~set_columns`. Args: @@ -932,7 +1120,7 @@ def set_column(self, column: Union[int, str], value: Any, in_place: bool = False """ return self.set_columns({column: value}, in_place=in_place) - def set_columns(self, columns: Dict[str, Any], in_place: bool = False) -> "BiocFrame": + def set_columns(self, columns: Dict[Union[str, int], Any], in_place: bool = False) -> BiocFrame: """Modify existing columns or add new columns. Args: @@ -955,6 +1143,12 @@ def set_columns(self, columns: Dict[str, Any], in_place: bool = False) -> "BiocF previous = len(output._column_names) for column, value in columns.items(): + if output.shape == (0, 0): + output._number_of_rows = ut.get_height(value) + + if output._row_names is not None and len(output._row_names) == 0 and output._number_of_rows > 0: + output._row_names = None + if ut.get_height(value) != output.shape[0]: raise ValueError( "Length of `value`, does not match the number of the rows," @@ -979,7 +1173,7 @@ def set_columns(self, columns: Dict[str, Any], in_place: bool = False) -> "BiocF return output - def remove_column(self, column: Union[int, str], in_place: bool = False) -> "BiocFrame": + def remove_column(self, column: Union[int, str], in_place: bool = False) -> BiocFrame: """Remove a column. This is a convenience wrapper around :py:attr:`~remove_columns`. 
Args: @@ -995,11 +1189,12 @@ def remove_column(self, column: Union[int, str], in_place: bool = False) -> "Bio """ return self.remove_columns([column], in_place=in_place) - def remove_columns(self, columns: Union[Sequence[Union[int, str]], slice], in_place: bool = False) -> "BiocFrame": + def remove_columns(self, columns: Union[Sequence[Union[int, str]], slice], in_place: bool = False) -> BiocFrame: """Remove any number of existing columns. Args: - columns: Column identifiers to remove. Must be either: + columns: + Column identifiers to remove. Must be either: - A sequence of strings matching column names - A sequence of integer indices - A slice object @@ -1027,9 +1222,13 @@ def remove_columns(self, columns: Union[Sequence[Union[int, str]], slice], in_pl killset = set() for name in columns: if isinstance(name, int): + if name < 0 or name >= len(output._column_names): + raise IndexError(f"Column index {name} is out of range.") name = output._column_names[name] + if name not in output._data: raise ValueError(f"Column '{name}' does not exist.") + del output._data[name] killset.add(name) @@ -1044,7 +1243,7 @@ def remove_columns(self, columns: Union[Sequence[Union[int, str]], slice], in_pl return output - def remove_row(self, row: Union[int, str], in_place: bool = False) -> "BiocFrame": + def remove_row(self, row: Union[int, str], in_place: bool = False) -> BiocFrame: """Remove a row. This is a convenience wrapper around :py:attr:`~remove_rows`. Args: @@ -1060,11 +1259,12 @@ def remove_row(self, row: Union[int, str], in_place: bool = False) -> "BiocFrame """ return self.remove_rows([row], in_place=in_place) - def remove_rows(self, rows: Union[Sequence[Union[int, str]], slice], in_place: bool = False) -> "BiocFrame": + def remove_rows(self, rows: Union[Sequence[Union[int, str]], slice], in_place: bool = False) -> BiocFrame: """Remove any number of existing rows. Args: - rows: Row identifiers to remove. Must be either: + rows: + Row identifiers to remove. 
Must be either: - A sequence of strings matching row names - A sequence of integer indices - A slice object @@ -1083,9 +1283,14 @@ def remove_rows(self, rows: Union[Sequence[Union[int, str]], slice], in_place: b if not in_place: output._data = copy(output._data) + _row_names = output._row_names + if output._row_names is None: + # raise ValueError("Cannot remove rows when row names are not defined.") + _row_names = range(len(output)) + if isinstance(rows, slice): - indices = range(*rows.indices(len(output._row_names))) - killset = {output._row_names[i] for i in indices} + indices = range(*rows.indices(len(_row_names))) + killset = {_row_names[i] for i in indices} else: # Check for homogeneous types types = set(type(x) for x in rows) @@ -1095,20 +1300,23 @@ def remove_rows(self, rows: Union[Sequence[Union[int, str]], slice], in_place: b killset = set() for name in rows: if isinstance(name, int): - name = output._row_names[name] - if name not in output._row_names: + if name < 0 or name >= len(_row_names): + raise IndexError(f"Row index {name} is out of range.") + name = _row_names[name] + + if name not in _row_names: raise ValueError(f"Row '{name}' does not exist.") killset.add(name) keep = [] - for i, row in enumerate(output._row_names): + for i, row in enumerate(_row_names): if row not in killset: keep.append(i) for col in output._data: output._data[col] = ut.subset_sequence(output._data[col], keep) - output._row_names = ut.subset_sequence(output._row_names, keep) + output._row_names = ut.subset_sequence(_row_names, keep) output._number_of_rows = int( _guess_number_of_rows( @@ -1124,7 +1332,7 @@ def remove_rows(self, rows: Union[Sequence[Union[int, str]], slice], in_place: b ######>> Copying <<###### ######################### - def __deepcopy__(self, memo=None, _nil=[]): + def __deepcopy__(self, memo: Optional[Dict[int, Any]] = None) -> BiocFrame: """ Returns: A deep copy of the current ``BiocFrame``. 
@@ -1155,7 +1363,7 @@ def __deepcopy__(self, memo=None, _nil=[]): column_data=_column_data_copy, ) - def __copy__(self): + def __copy__(self) -> BiocFrame: """ Returns: A shallow copy of the current ``BiocFrame``. @@ -1168,11 +1376,12 @@ def __copy__(self): column_names=self._column_names, metadata=self._metadata, column_data=self._column_data, + _validate=False, ) return new_instance - def copy(self): + def copy(self) -> BiocFrame: """Alias for :py:meth:`~__copy__`.""" return self.__copy__() @@ -1180,7 +1389,7 @@ def copy(self): ######>> split by <<###### ########################## - def split(self, column_name: str, only_indices: bool = False) -> Dict[str, Union["BiocFrame", List[int]]]: + def split(self, column_name: str, only_indices: bool = False) -> Dict[str, Union[BiocFrame, List[int]]]: """Split the object by a column. Args: @@ -1234,7 +1443,7 @@ def columns(self) -> ut.Names: """Alias for :py:attr:`~get_column_names`, provided for compatibility with **pandas**.""" return self.get_column_names() - def to_pandas(self): + def to_pandas(self) -> "pandas.DataFrame": """Convert the ``BiocFrame`` into a :py:class:`~pandas.DataFrame` object. Returns: @@ -1250,7 +1459,7 @@ def to_pandas(self): return DataFrame(data={}, index=range(self._number_of_rows)) @classmethod - def from_pandas(cls, input: "pandas.DataFrame") -> "BiocFrame": + def from_pandas(cls, input: "pandas.DataFrame") -> BiocFrame: """Create a ``BiocFrame`` from a :py:class:`~pandas.DataFrame` object. Args: @@ -1279,7 +1488,7 @@ def from_pandas(cls, input: "pandas.DataFrame") -> "BiocFrame": ################################ @classmethod - def from_polars(cls, input: "polars.DataFrame") -> "BiocFrame": + def from_polars(cls, input: "polars.DataFrame") -> BiocFrame: """Create a ``BiocFrame`` from a :py:class:`~polars.DataFrame` object. 
Args: @@ -1299,7 +1508,7 @@ def from_polars(cls, input: "polars.DataFrame") -> "BiocFrame": return cls(data=rdata) - def to_polars(self): + def to_polars(self) -> "polars.DataFrame": """Convert the ``BiocFrame`` into a :py:class:`~polars.DataFrame` object. Returns: @@ -1318,7 +1527,9 @@ def to_polars(self): ######>> Miscellaneous <<###### ############################### - def flatten(self, as_type: Literal["dict", "biocframe"] = "dict", delim: str = ".") -> "BiocFrame": + def flatten( + self, as_type: Literal["dict", "biocframe"] = "dict", delim: str = "." + ) -> Union[Dict[str, Any], BiocFrame]: """Flatten a nested BiocFrame object. Args: @@ -1332,7 +1543,7 @@ def flatten(self, as_type: Literal["dict", "biocframe"] = "dict", delim: str = " Returns: An object with the type specified by ``as_type`` argument. If ``as_type`` is `dict`, an additional column "rownames" is added if the object - contains rownames. + contains row names. """ if as_type not in ["dict", "biocframe"]: @@ -1360,7 +1571,7 @@ def flatten(self, as_type: Literal["dict", "biocframe"] = "dict", delim: str = " # TODO: very primitive implementation, needs very robust testing # TODO: implement in-place, view - def __array_ufunc__(self, func, method, *inputs, **kwargs) -> "BiocFrame": + def __array_ufunc__(self, func: Any, method: str, *inputs: Any, **kwargs: Any) -> BiocFrame: """Interface for NumPy array methods. Note: This is a very primitive implementation and needs tests to support different types. 
@@ -1389,33 +1600,33 @@ def __array_ufunc__(self, func, method, *inputs, **kwargs) -> "BiocFrame": ######>> Combine Ops <<###### ############################# - def combine(self, *other): + def combine(self, *other: BiocFrame) -> BiocFrame: """Wrapper around :py:func:`~relaxed_combine_rows`, provided for back-compatibility only.""" return relaxed_combine_rows(self, *other) - def relaxed_combine_rows(self, *other): + def relaxed_combine_rows(self, *other: BiocFrame) -> BiocFrame: """Wrapper around :py:func:`~relaxed_combine_rows`.""" return relaxed_combine_rows(self, *other) - def relaxed_combine_columns(self, *other): + def relaxed_combine_columns(self, *other: BiocFrame) -> BiocFrame: """Wrapper around :py:func:`~relaxed_combine_columns`.""" return relaxed_combine_columns(self, *other) - def combine_rows(self, *other): + def combine_rows(self, *other: BiocFrame) -> BiocFrame: """Wrapper around :py:func:`~biocutils.combine_rows`.""" return _combine_rows_bframes(self, *other) - def combine_columns(self, *other): + def combine_columns(self, *other: BiocFrame) -> BiocFrame: """Wrapper around :py:func:`~biocutils.combine_columns`.""" return _combine_cols_bframes(self, *other) def merge( self, - *other: Sequence["BiocFrame"], - by: Union[None, str, Sequence] = None, + *other: BiocFrame, + by: Union[None, str, int, Sequence[Union[None, str, int]]] = None, join: Literal["inner", "left", "right", "outer"] = "left", rename_duplicate_columns: bool = False, - ): + ) -> BiocFrame: """Wrapper around :py:func:`merge`.""" return merge( [self] + list(other), @@ -1429,7 +1640,22 @@ def merge( @ut.combine_rows.register(BiocFrame) -def _combine_rows_bframes(*x: BiocFrame): +def _combine_rows_bframes(*x: BiocFrame) -> BiocFrame: + """Combine multiple BiocFrame objects by row. + + Args: + *x: + One or more BiocFrame objects. + + Raises: + TypeError: + If all objects are not BiocFrame objects. + ValueError: + If all objects do not have the same number of columns. 
+ + Returns: + A new BiocFrame object. + """ if not ut.is_list_of_type(x, BiocFrame): raise TypeError("All objects to combine must be BiocFrame objects.") @@ -1486,7 +1712,22 @@ def _combine_rows_bframes(*x: BiocFrame): @ut.combine_columns.register(BiocFrame) -def _combine_cols_bframes(*x: BiocFrame): +def _combine_cols_bframes(*x: BiocFrame) -> BiocFrame: + """Combine multiple BiocFrame objects by column. + + Args: + *x: + One or more BiocFrame objects. + + Raises: + TypeError: + If all objects are not BiocFrame objects. + ValueError: + If all objects do not have the same number of rows or have duplicate columns. + + Returns: + A new BiocFrame object. + """ if not ut.is_list_of_type(x, BiocFrame): raise TypeError("All objects to combine must be BiocFrame objects.") @@ -1534,17 +1775,47 @@ def _combine_cols_bframes(*x: BiocFrame): @ut.extract_row_names.register(BiocFrame) -def _rownames_bframe(x: BiocFrame): +def _rownames_bframe(x: BiocFrame) -> Optional[ut.Names]: + """Extract row names from a BiocFrame object. + + Args: + x: + A BiocFrame object. + + Returns: + Row names. + """ return x.get_row_names() @ut.extract_column_names.register(BiocFrame) -def _colnames_bframe(x: BiocFrame): +def _colnames_bframe(x: BiocFrame) -> ut.Names: + """Extract column names from a BiocFrame object. + + Args: + x: + A BiocFrame object. + + Returns: + Column names. + """ return x.get_column_names() @ut.show_as_cell.register(BiocFrame) def _show_as_cell_BiocFrame(x: BiocFrame, indices: Sequence[int]) -> List[str]: + """Show a BiocFrame as a cell. + + Args: + x: + A BiocFrame object. + + indices: + Indices to show. + + Returns: + A list of strings. 
+ """ constructs = [] for i in indices: constructs.append([]) @@ -1554,14 +1825,29 @@ def _show_as_cell_BiocFrame(x: BiocFrame, indices: Sequence[int]) -> List[str]: for i, v in enumerate(col): constructs[i].append(v) - for i, x in enumerate(constructs): - constructs[i] = ":".join(x) + for i, z in enumerate(constructs): + constructs[i] = ":".join(z) return constructs @ut.assign_rows.register(BiocFrame) def _assign_rows_BiocFrame(x: BiocFrame, indices: Sequence[int], replacement: BiocFrame) -> BiocFrame: + """Assign rows to a BiocFrame object. + + Args: + x: + A BiocFrame object. + + indices: + Indices to assign. + + replacement: + A BiocFrame object to assign. + + Returns: + A new BiocFrame object. + """ return x.set_slice(indices, replacement.get_column_names(), replacement) @@ -1569,7 +1855,19 @@ def _assign_rows_BiocFrame(x: BiocFrame, indices: Sequence[int], replacement: Bi # Could turn this into a generic, if it was more useful elsewhere. -def _construct_missing(col, n): +def _construct_missing(col: Any, n: int) -> Any: + """Construct a missing value for a column. + + Args: + col: + A column. + + n: + Number of missing values to construct. + + Returns: + A missing value. + """ if isinstance(col, numpy.ndarray): return numpy.ma.array( numpy.zeros(n, dtype=col.dtype), @@ -1629,7 +1927,28 @@ def relaxed_combine_rows(*x: BiocFrame) -> BiocFrame: ############################ -def _normalize_merge_key_to_index(x, i, by): +def _normalize_merge_key_to_index(x: Sequence[BiocFrame], i: int, by: Union[None, str, int]) -> Optional[int]: + """Normalize a merge key to an index. + + Args: + x: + A sequence of BiocFrame objects. + + i: + Index of the object in the sequence. + + by: + A merge key. + + Raises: + ValueError: + If the merge key is invalid. + TypeError: + If the merge key is of an unknown type. + + Returns: + An index. 
+ """ if by is None: if x[i]._row_names is None: raise ValueError("Row names required as key but are absent in object " + str(i) + ".") @@ -1651,7 +1970,22 @@ def _normalize_merge_key_to_index(x, i, by): raise TypeError("Unknown type '" + type(by).__name__ + "' for the 'by' argument.") -def _get_merge_key(x, i, by): +def _get_merge_key(x: Sequence[BiocFrame], i: int, by: List[Optional[int]]) -> Any: + """Get a merge key. + + Args: + x: + A sequence of BiocFrame objects. + + i: + Index of the object in the sequence. + + by: + A list of merge keys. + + Returns: + A merge key. + """ if by[i] is None: return x[i]._row_names else: @@ -1660,10 +1994,10 @@ def _get_merge_key(x, i, by): def merge( x: Sequence[BiocFrame], - by: Union[None, str, Sequence] = None, + by: Union[None, str, int, Sequence[Union[None, str, int]]] = None, join: Literal["inner", "left", "right", "outer"] = "left", rename_duplicate_columns: bool = False, -) -> "BiocFrame": +) -> BiocFrame: """Merge multiple :py:class:`~BiocFrame`` objects together by common columns or row names, yielding a combined object with a union of columns across all objects. @@ -1694,7 +2028,7 @@ def merge( raised instead. Returns: - BiocFrame: A BiocFrame containing the merged contents. + A BiocFrame containing the merged contents. If ``by = None``, the keys are stored in the row names. 
@@ -1736,6 +2070,11 @@ def merge( elif join == "right": noop = i == len(x) - 1 + keep = None + has_missing = 0 + reorg_keep = None + reorg_permute = None + if not noop: keep = ut.match(all_keys, _get_merge_key(x, i, by)) has_missing = (keep < 0).sum() @@ -1773,8 +2112,12 @@ def merge( elif on_key: new_data[y] = all_keys elif has_missing == 0: + if keep is None: + raise RuntimeError("Internal error: 'keep' is None when has_missing == 0.") new_data[y] = ut.subset(val, keep) else: + if reorg_keep is None or reorg_permute is None: + raise RuntimeError("Internal error: 'reorg_keep' or 'reorg_permute' is None when has_missing > 0.") retained = ut.subset(val, reorg_keep) combined = ut.combine(retained, _construct_missing(val, 1)) new_data[y] = ut.subset(combined, reorg_permute) @@ -1811,4 +2154,4 @@ def merge( @ut.relaxed_combine_columns.register(BiocFrame) def relaxed_combine_columns(*x: BiocFrame) -> BiocFrame: """Wrapper around :py:func:`~merge` that performs a left join on the row names.""" - return merge(x, join="left", by=None) + return merge(list(x), join="left", by=None) diff --git a/src/biocframe/io/from_pandas.py b/src/biocframe/io/from_pandas.py index e720f05..71f2d62 100644 --- a/src/biocframe/io/from_pandas.py +++ b/src/biocframe/io/from_pandas.py @@ -1,4 +1,11 @@ -from ..BiocFrame import BiocFrame +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import pandas + +from ..frame import BiocFrame __author__ = "jkanche" __copyright__ = "jkanche" diff --git a/tests/test_coercions.py b/tests/test_coercions.py new file mode 100644 index 0000000..c3b4424 --- /dev/null +++ b/tests/test_coercions.py @@ -0,0 +1,34 @@ +import pytest +from biocframe import BiocFrame +import biocutils as ut + +def test_to_dict(): + obj = BiocFrame({"A": [1, 2], "B": [3, 4]}) + d = obj.to_dict() + + assert isinstance(d, dict) + assert d["A"] == [1, 2] + assert d["B"] == [3, 4] + assert d is obj.get_data() + +def test_to_NamedList(): + obj = 
BiocFrame({"A": [1, 2], "B": [3, 4]}) + + nl = obj.to_NamedList() + assert isinstance(nl, ut.NamedList) + assert len(nl) == 2 + assert nl.get_names().as_list() == ["A", "B"] + assert nl[0] == [1, 2] + + # Test order + obj = BiocFrame({"B": [3, 4], "A": [1, 2]}) + obj = obj.set_column_names(["B", "A"]) + + obj2 = BiocFrame({}, column_names=[]) + obj2["Z"] = [1] + obj2["X"] = [2] + + nl2 = obj2.to_NamedList() + assert nl2.get_names().as_list() == ["Z", "X"] + assert nl2[0] == [1] + assert nl2[1] == [2] diff --git a/tests/test_edge_cases.py b/tests/test_edge_cases.py new file mode 100644 index 0000000..7d6ad94 --- /dev/null +++ b/tests/test_edge_cases.py @@ -0,0 +1,230 @@ +import numpy as np +import pytest +from biocframe import BiocFrame +from biocutils import Names + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def test_get_slice_with_slice_none(): + """Test that get_slice handles slice(None) correctly for columns.""" + bframe = BiocFrame( + { + "col1": [1, 2, 3], + "col2": [4, 5, 6], + }, + column_data=BiocFrame({"meta": [1, 2]}), + ) + + result = bframe.get_slice(slice(0, 2), slice(None)) + assert result.shape == (2, 2) + assert result.column_data is not None + assert result.column_data.shape[0] == 2 + + +def test_remove_rows_without_row_names(): + """Test that remove_rows raises appropriate error when row_names is None.""" + bframe = BiocFrame( + { + "col1": [1, 2, 3], + "col2": [4, 5, 6], + } + ) + + result = bframe.remove_rows([0, 1]) + assert result.shape == (1, 2) + + +def test_get_row_index_validation(): + """Test that get_row validates integer indices properly.""" + bframe = BiocFrame( + { + "col1": [1, 2, 3], + "col2": [4, 5, 6], + } + ) + + with pytest.raises(IndexError, match="Row index cannot be negative"): + bframe.get_row(-1) + + with pytest.raises(IndexError, match="Row index 10 is out of range"): + bframe.get_row(10) + + result = bframe.get_row(0) + assert result == {"col1": 1, "col2": 4} + + result = 
bframe.get_row(np.int64(1)) + assert result == {"col1": 2, "col2": 5} + + +def test_get_column_index_validation(): + """Test that get_column validates integer indices properly.""" + bframe = BiocFrame( + { + "col1": [1, 2, 3], + "col2": [4, 5, 6], + } + ) + + with pytest.raises(IndexError, match="Index cannot be negative"): + bframe.get_column(-1) + + with pytest.raises(IndexError, match="Index 10 is out of range"): + bframe.get_column(10) + + result = bframe.get_column(0) + assert result == [1, 2, 3] + + +def test_remove_columns_index_validation(): + """Test that remove_columns validates integer indices properly.""" + bframe = BiocFrame( + { + "col1": [1, 2, 3], + "col2": [4, 5, 6], + "col3": [7, 8, 9], + } + ) + + with pytest.raises(IndexError, match="Column index -1 is out of range"): + bframe.remove_columns([-1]) + + with pytest.raises(IndexError, match="Column index 10 is out of range"): + bframe.remove_columns([10]) + + result = bframe.remove_columns([0, 2]) + assert result.shape == (3, 1) + assert result.column_names.as_list() == ["col2"] + + +def test_remove_rows_index_validation(): + """Test that remove_rows validates integer indices properly.""" + bframe = BiocFrame( + { + "col1": [1, 2, 3, 4], + "col2": [5, 6, 7, 8], + }, + row_names=["row1", "row2", "row3", "row4"], + ) + + with pytest.raises(IndexError, match="Row index -1 is out of range"): + bframe.remove_rows([-1]) + + with pytest.raises(IndexError, match="Row index 10 is out of range"): + bframe.remove_rows([10]) + + result = bframe.remove_rows([0, 2]) + assert result.shape == (2, 2) + assert result.row_names.as_list() == ["row2", "row4"] + + +def test_merge_variable_scope(): + """Test that merge function handles variable scope correctly.""" + from biocframe import merge + + obj1 = BiocFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, row_names=["a", "b", "c"]) + obj2 = BiocFrame({"C": [7, 8]}, row_names=["b", "c"]) + + result = merge([obj1, obj2], by=None, join="left") + assert result.shape == (3, 3) + 
assert result.column("B") == [4, 5, 6] + assert result.column("C") == [None, 7, 8] + + result = merge([obj1, obj2], by=None, join="right") + assert result.shape == (2, 3) + assert result.column("B") == [5, 6] + assert result.column("C") == [7, 8] + + +def test_get_row_with_string_and_numpy_int(): + """Test that get_row handles numpy integer types correctly.""" + bframe = BiocFrame( + { + "col1": [1, 2, 3], + "col2": [4, 5, 6], + }, + row_names=["row1", "row2", "row3"], + ) + + result = bframe.get_row("row2") + assert result == {"col1": 2, "col2": 5} + + result = bframe.get_row(np.int64(1)) + assert result == {"col1": 2, "col2": 5} + + result = bframe.get_row(np.int32(0)) + assert result == {"col1": 1, "col2": 4} + + +def test_empty_biocframe_operations(): + """Test operations on empty BiocFrame objects.""" + empty = BiocFrame({}) + assert empty.shape == (0, 0) + + empty = BiocFrame({}, number_of_rows=10) + assert empty.shape == (10, 0) + assert len(empty.column_names) == 0 + + sliced = empty[0:5, :] + assert sliced.shape == (5, 0) + + +def test_column_names_assignment_edge_cases(): + """Test edge cases in column_names assignment.""" + bframe = BiocFrame( + { + "col1": [1, 2, 3], + "col2": [4, 5, 6], + } + ) + + new_names = Names(["foo", "bar"]) + result = bframe.set_column_names(new_names) + assert result.column_names.as_list() == ["foo", "bar"] + + result = bframe.set_column_names(["baz", "qux"]) + assert result.column_names.as_list() == ["baz", "qux"] + + +def test_row_names_with_none_values(): + """Test that row names cannot contain None values.""" + bframe = BiocFrame( + { + "col1": [1, 2, 3], + "col2": [4, 5, 6], + } + ) + + with pytest.raises(ValueError, match="cannot contain None values"): + bframe.set_row_names(["row1", None, "row3"]) + + +def test_get_slice_with_all_slice_none(): + """Test get_slice when both rows and columns are slice(None).""" + bframe = BiocFrame( + { + "col1": [1, 2, 3], + "col2": [4, 5, 6], + }, + column_data=BiocFrame({"meta": [1, 
2]}), + ) + + result = bframe.get_slice(slice(None), slice(None)) + assert result.shape == bframe.shape + assert result.column_data is not None + assert result.column_data.shape == bframe.column_data.shape + + +def test_merge_with_missing_keys(): + """Test merge function with missing keys in join operations.""" + from biocframe import merge + + obj1 = BiocFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, row_names=["a", "b", "c"]) + obj2 = BiocFrame({"C": [7, 8]}, row_names=["d", "e"]) + + result = merge([obj1, obj2], by=None, join="outer") + assert len(result) == 5 + assert result.column("B") == [4, 5, 6, None, None] + assert result.column("C") == [None, None, None, 7, 8] diff --git a/tests/test_equality.py b/tests/test_equality.py new file mode 100644 index 0000000..9f39541 --- /dev/null +++ b/tests/test_equality.py @@ -0,0 +1,96 @@ +import pytest +import numpy as np +from biocframe import BiocFrame + +def test_equality_basics(): + obj1 = BiocFrame({"A": [1, 2, 3], "B": ["x", "y", "z"]}) + obj2 = BiocFrame({"A": [1, 2, 3], "B": ["x", "y", "z"]}) + assert obj1 == obj2 + + # Different data + obj3 = BiocFrame({"A": [1, 2, 4], "B": ["x", "y", "z"]}) + assert obj1 != obj3 + + # Different column names + obj4 = BiocFrame({"A": [1, 2, 3], "C": ["x", "y", "z"]}) + assert obj1 != obj4 + + # Different dims + obj5 = BiocFrame({"A": [1, 2]}) + assert obj1 != obj5 + +def test_equality_metadata_columndata(): + obj1 = BiocFrame( + {"A": [1, 2]}, + metadata={"m": 1}, + column_data=BiocFrame({"annot": [10]}, row_names=["A"]) + ) + obj2 = BiocFrame( + {"A": [1, 2]}, + metadata={"m": 1}, + column_data=BiocFrame({"annot": [10]}, row_names=["A"]) + ) + assert obj1 == obj2 + + # Metadata mismatch + obj3 = obj1.copy() + obj3.metadata = {"m": 2} + assert obj1 != obj3 + + # Column data mismatch + obj4 = obj1.copy() + obj4.column_data = BiocFrame({"annot": [20]}, row_names=["A"]) + assert obj1 != obj4 + +def test_equality_nested(): + obj1 = BiocFrame({ + "A": [1], + "nested": BiocFrame({"B": [2]}) + 
}) + obj2 = BiocFrame({ + "A": [1], + "nested": BiocFrame({"B": [2]}) + }) + assert obj1 == obj2 + + obj3 = BiocFrame({ + "A": [1], + "nested": BiocFrame({"B": [3]}) + }) + assert obj1 != obj3 + +def test_equality_numpy(): + obj1 = BiocFrame({"A": np.array([1, 2, 3])}) + obj2 = BiocFrame({"A": np.array([1, 2, 3])}) + assert obj1 == obj2 + + obj3 = BiocFrame({"A": np.array([1, 2, 4])}) + assert obj1 != obj3 + +def test_equality_pandas(): + try: + import pandas as pd + except ImportError: + pytest.skip("pandas not installed") + + obj1 = BiocFrame({"A": pd.Series([1, 2, 3])}) + obj2 = BiocFrame({"A": pd.Series([1, 2, 3])}) + + # This triggers the exception handling block for ambiguous truth values + assert obj1 == obj2 + + obj3 = BiocFrame({"A": pd.Series([1, 2, 4])}) + assert obj1 != obj3 + +def test_equality_polars(): + try: + import polars as pl + except ImportError: + pytest.skip("polars not installed") + + obj1 = BiocFrame({"A": pl.Series([1, 2, 3])}) + obj2 = BiocFrame({"A": pl.Series([1, 2, 3])}) + assert obj1 == obj2 + + obj3 = BiocFrame({"A": pl.Series([1, 2, 4])}) + assert obj1 != obj3 diff --git a/tests/test_initialize.py b/tests/test_initialize.py index 9d2bc60..ecce83b 100644 --- a/tests/test_initialize.py +++ b/tests/test_initialize.py @@ -173,3 +173,27 @@ def test_NamedList(): assert isinstance(frame, BiocFrame) assert frame.shape == (1, 4) assert list(frame.get_column_names()) == ["A", "B", "C", "D"] + +def test_initialize_from_sequence(): + data = [ + [1, 2, 3], # Column 1 + ["a", "b", "c"], # Column 2 + ] + col_names = ["id", "val"] + + bframe = BiocFrame(data, column_names=col_names) + + assert bframe is not None + assert bframe.shape == (3, 2) + assert bframe.column_names.as_list() == ["id", "val"] + assert bframe.column("id") == [1, 2, 3] + assert bframe.column("val") == ["a", "b", "c"] + +def test_initialize_from_sequence_errors(): + data = [[1, 2], [3, 4]] + + with pytest.raises(ValueError, match="`column_names` must be provided"): + 
BiocFrame(data) + + with pytest.raises(ValueError, match="Length of `data` and `column_names` must match"): + BiocFrame(data, column_names=["A"]) diff --git a/tests/test_methods.py b/tests/test_methods.py index 43f3a1f..18b6397 100644 --- a/tests/test_methods.py +++ b/tests/test_methods.py @@ -1,7 +1,7 @@ import numpy as np import pytest import pandas as pd -from biocframe.BiocFrame import BiocFrame +from biocframe.frame import BiocFrame from biocutils import Factor, Names import biocutils as ut @@ -702,3 +702,68 @@ def test_bframe_split(): assert isinstance(split_frame, dict) assert len(split_frame) == 2 assert len(split_frame["b"]) == 2 + +def test_get_columns(): + obj = { + "A": [1, 2, 3], + "B": ["x", "y", "z"] + } + bframe = BiocFrame(obj) + + cols = bframe.get_columns() + + assert isinstance(cols, list) + assert len(cols) == 2 + assert cols[0] == [1, 2, 3] + assert cols[1] == ["x", "y", "z"] + +def test_contains_dunder(): + obj = {"A": [1], "B": [2]} + bframe = BiocFrame(obj) + + assert "A" in bframe + assert "B" in bframe + assert "C" not in bframe + +def test_empty_property(): + empty_bf = BiocFrame({}, number_of_rows=0) + assert empty_bf.empty is True + + full_bf = BiocFrame({"A": [1, 2]}) + assert full_bf.empty is False + + cols_bf = BiocFrame({"A": [], "B": []}) + assert cols_bf.empty is True + +def test_head_tail(): + data = { + "val": list(range(10)) + } + bframe = BiocFrame(data) + + h = bframe.head(3) + assert h.shape == (3, 1) + assert h.column("val") == [0, 1, 2] + + h_default = bframe.head() + assert h_default.shape == (5, 1) + assert h_default.column("val") == [0, 1, 2, 3, 4] + + h_large = bframe.head(20) + assert h_large.shape == (10, 1) + + t = bframe.tail(3) + assert t.shape == (3, 1) + assert t.column("val") == [7, 8, 9] + + t_large = bframe.tail(20) + assert t_large.shape == (10, 1) + +def test_head_tail_errors(): + bframe = BiocFrame({"A": [1, 2, 3]}) + + with pytest.raises(ValueError): + bframe.head(-1) + + with pytest.raises(ValueError): 
+ bframe.tail(-1) diff --git a/tests/test_pandas.py b/tests/test_pandas.py index 32cf8f8..c49750d 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -1,7 +1,7 @@ import numpy as np import pytest import pandas as pd -from biocframe.BiocFrame import BiocFrame +from biocframe.frame import BiocFrame from biocutils import Factor __author__ = "jkanche" diff --git a/tests/test_readme.py b/tests/test_readme.py index 067db10..a8356d0 100644 --- a/tests/test_readme.py +++ b/tests/test_readme.py @@ -1,6 +1,6 @@ from random import random -from biocframe.BiocFrame import BiocFrame +from biocframe.frame import BiocFrame __author__ = "jkanche" __copyright__ = "jkanche"