Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitattributes

This file was deleted.

18 changes: 18 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
name: Python package

on: [push]

jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: setup python for tox
uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: install tox
run: python -m pip install tox tox-conda
- name: Test with tox
run: |
tox
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [1.0.0] 2023-05-19
### Changed
- GenBank parser will not optimize CDS blocks, so that CDS intervals with adjacent sites can be loaded as such.

## [0.19.0] 2022-10-21
### Added
- `AA_EXTENDED`, `AA_STRICT_GAPPED`, `AA_EXTENDED_GAPPED`, and `AA_STRICT_UNKNOWN` alphabets.
Expand Down Expand Up @@ -166,7 +170,7 @@ as child of the GeneInterval or FeatureCollectionInterval objects. Fix bug intro
## [0.7.0]
### Changed
- GenBank position-sorted parser can now handle CDS records that are not directly following a gene record.
- Refactor `Location`, `Parent` and `Sequence` to have base classes `AbstractLocation`, `AbstractParent` and `AbstractSequence` that are in the base of the `inscripta.biocantor.location` module. This greatly helps with resolving circular imports.
- Refactor `Location`, `Parent` and `Sequence` to have base classes `AbstractLocation`, `AbstractParent` and `AbstractSequence` that are in the base of the `biocantor.location` module. This greatly helps with resolving circular imports.
- Optimized checking `sequence` and `location` members to explicitly check for `None`. This avoids a call to `__len__`.
- `CompoundInterval._single_intervals` is now lazily evaluated, because it is expensive to generate many `SingleInterval` objects.
- `CompoundInterval` now stores the positions as two sorted integer lists.
Expand Down
21 changes: 0 additions & 21 deletions LICENSE.txt

This file was deleted.

12 changes: 6 additions & 6 deletions benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from pathlib import Path

from inscripta.biocantor.gene import CDSInterval
from inscripta.biocantor.io.genbank.parser import parse_genbank, ParsedAnnotationRecord
from inscripta.biocantor.io.gff3.parser import parse_standard_gff3
from inscripta.biocantor.location import Strand, SingleInterval
from inscripta.biocantor.parent import Parent, SequenceType
from inscripta.biocantor.sequence import Sequence, Alphabet
from biocantor.gene import CDSInterval
from biocantor.io.genbank.parser import parse_genbank, ParsedAnnotationRecord
from biocantor.io.gff3.parser import parse_standard_gff3
from biocantor.location import Strand, SingleInterval
from biocantor.parent import Parent, SequenceType
from biocantor.sequence import Sequence, Alphabet

DATA_DIR = Path(__file__).parent.parent / "tests/data"

Expand Down
5 changes: 4 additions & 1 deletion inscripta/biocantor/__init__.py → biocantor/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.19.0"
__version__ = "1.0.0"

from abc import ABC, abstractmethod
from enum import Enum
Expand Down Expand Up @@ -322,6 +322,7 @@ def intersection(
match_strand: bool = True,
full_span: bool = False,
strict_parent_compare: bool = False,
optimize_blocks: bool = True,
) -> "AbstractLocation":
"""Returns a new Location representing the intersection of this Location with the other Location.
Returned Location, if nonempty, has the same Strand as this Location. This operation is commutative
Expand All @@ -338,6 +339,8 @@ def intersection(
If set to True, compare the full span of this Location to the full span of the other Location.
strict_parent_compare
Raise MismatchedParentException if parents do not match
optimize_blocks
Should the resulting blocks be optimized? Defaults to True.

"""

Expand Down
File renamed without changes.
File renamed without changes.
17 changes: 17 additions & 0 deletions biocantor/gene/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""
Special feature arithmetic operations for CDSs, codons and translation.

Container classes wrap locations to model genes, transcripts and generic genomic intervals.
"""

from biocantor.gene.biotype import Biotype # noqa F401
from biocantor.gene.cds_frame import CDSPhase, CDSFrame # noqa F401
from biocantor.gene.codon import Codon, TranslationTable # noqa F401
from biocantor.gene.cds import CDSInterval # noqa F401
from biocantor.gene.feature import FeatureInterval, FeatureIntervalCollection # noqa F401
from biocantor.gene.transcript import TranscriptInterval # noqa F401
from biocantor.gene.collections import ( # noqa F401
AnnotationCollection,
)
from biocantor.gene.gene import GeneInterval # noqa F401
from biocantor.gene.variants import VariantInterval, VariantIntervalCollection # noqa F401
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Biotypes are types of genes and transcripts, as defined by NCBI (INSDC) and Sequence Ontology.
"""

from inscripta.biocantor.util.enum import HasMemberMixin
from biocantor.util.enum import HasMemberMixin


Biotype = HasMemberMixin(
Expand Down
134 changes: 91 additions & 43 deletions inscripta/biocantor/gene/cds.py → biocantor/gene/cds.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,30 @@
import warnings
from itertools import count, zip_longest
from typing import Iterator, List, Union, Optional, Dict, Hashable, Any, Set, Tuple, TYPE_CHECKING
from typing import Iterator, List, Union, Optional, Dict, Hashable, Any, Set, Tuple, TYPE_CHECKING, Type
from uuid import UUID

from methodtools import lru_cache

from inscripta.biocantor.exc import (
from biocantor.exc import (
InvalidCDSIntervalError,
NoSuchAncestorException,
LocationOverlapException,
MismatchedFrameException,
EmptyLocationException,
)
from inscripta.biocantor.gene.cds_frame import CDSPhase, CDSFrame
from inscripta.biocantor.gene.codon import Codon, TranslationTable
from inscripta.biocantor.gene.interval import AbstractFeatureInterval, QualifierValue
from inscripta.biocantor.io.bed import RGB, BED12
from inscripta.biocantor.io.gff3.constants import GFF_SOURCE, NULL_COLUMN, BioCantorFeatureTypes, BioCantorQualifiers
from inscripta.biocantor.io.gff3.rows import GFFAttributes, GFFRow
from inscripta.biocantor.location import Location, Strand, SingleInterval, CompoundInterval
from inscripta.biocantor.parent import Parent, SequenceType
from inscripta.biocantor.sequence import Sequence, Alphabet
from inscripta.biocantor.util.hashing import digest_object
from biocantor.gene.cds_frame import CDSPhase, CDSFrame
from biocantor.gene.codon import Codon, TranslationTable
from biocantor.gene.interval import AbstractFeatureInterval, QualifierValue
from biocantor.io.bed import RGB, BED12
from biocantor.io.gff3.constants import GFF_SOURCE, NULL_COLUMN, BioCantorFeatureTypes, BioCantorQualifiers
from biocantor.io.gff3.rows import GFFAttributes, GFFRow, GTFRow, GTFAttributes
from biocantor.location import Location, Strand, SingleInterval, CompoundInterval
from biocantor.parent import Parent, SequenceType
from biocantor.sequence import Sequence, Alphabet
from biocantor.util.hashing import digest_object

if TYPE_CHECKING:
from inscripta.biocantor.gene.variants import VariantIntervalCollection, VariantInterval
from biocantor.gene.variants import VariantIntervalCollection, VariantInterval


class CDSInterval(AbstractFeatureInterval):
Expand All @@ -51,7 +51,6 @@ def __init__(
guid: Optional[UUID] = None,
parent_or_seq_chunk_parent: Optional[Parent] = None,
):

self._location = self.initialize_location(cds_starts, cds_ends, strand, parent_or_seq_chunk_parent)
self._genomic_starts = cds_starts
self._genomic_ends = cds_ends
Expand Down Expand Up @@ -141,7 +140,6 @@ def chunk_relative_frames(self) -> List[CDSFrame]:
distance_from_start = fivep_phase

for genomic_exon in self._exon_iter(chunk_relative_exon=False):

# chromosome location has overlapping blocks merged, so that the intersection always has one block
# this is OK to do here since the original genomic intervals retain the overlapping information
if isinstance(self._chunk_relative_bounded_chromosome_location, SingleInterval):
Expand Down Expand Up @@ -277,7 +275,7 @@ def from_chunk_relative_location(

.. code-block:: python

from inscripta.biocantor.io.parser import seq_chunk_to_parent
from biocantor.io.parser import seq_chunk_to_parent
parent = seq_chunk_to_parent('AANAAATGGCGAGCACCTAACCCCCNCC', "NC_000913.3", 222213, 222241)
loc = SingleInterval(5, 20, Strand.PLUS, parent=parent)

Expand Down Expand Up @@ -322,35 +320,15 @@ def export_qualifiers(
qualifiers[key].add(val)
return qualifiers

def to_gff(
def _to_gff_or_gtf(
self,
parent: Optional[str] = None,
parent_qualifiers: Optional[Dict[Hashable, Set[str]]] = None,
chromosome_relative_coordinates: bool = True,
raise_on_reserved_attributes: Optional[bool] = True,
) -> Iterator[GFFRow]:
"""Writes a GFF format list of lists for this CDS.

The additional qualifiers are used when writing a hierarchical relationship back to files. GFF files
are easier to work with if the children features have the qualifiers of their parents.

Args:
parent: ID of the Parent of this transcript.
parent_qualifiers: Directly pull qualifiers in from this dictionary.
chromosome_relative_coordinates: Output GFF in chromosome-relative coordinates? Will raise an exception
if there is not a ``sequence_chunk`` ancestor type.
raise_on_reserved_attributes: If ``True``, then GFF3 reserved attributes such as ``ID`` and ``Name`` present
in the qualifiers will lead to an exception and not a warning.

Yields:
:class:`~biocantor.io.gff3.rows.GFFRow`

Raises:
NoSuchAncestorException: If ``chromosome_relative_coordinates`` is ``False`` but there is no
``sequence_chunk`` ancestor type.
GFF3MissingSequenceNameError: If there are no sequence names associated with this transcript.
"""

row_type: Union[Type[GFFRow], Type[GTFRow]] = GFFRow,
attribute_type: Union[Type[GFFAttributes], Type[GTFAttributes]] = GFFAttributes,
) -> Iterator[Union[GFFRow, GTFRow]]:
if not chromosome_relative_coordinates and not self.has_ancestor_of_type(SequenceType.SEQUENCE_CHUNK):
raise NoSuchAncestorException(
"Cannot export GFF in relative coordinates without a sequence_chunk ancestor."
Expand All @@ -369,14 +347,14 @@ def to_gff(

for i, block, frame in zip(count(1), cds_blocks, frames):
start, end = block
attributes = GFFAttributes(
attributes = attribute_type(
id=f"{cds_guid}-{i}",
qualifiers=qualifiers,
name=self.protein_id,
parent=parent,
raise_on_reserved_attributes=raise_on_reserved_attributes,
)
row = GFFRow(
row = row_type(
self.sequence_name,
GFF_SOURCE,
BioCantorFeatureTypes.CDS,
Expand All @@ -389,6 +367,77 @@ def to_gff(
)
yield row

def to_gff(
self,
parent: Optional[str] = None,
parent_qualifiers: Optional[Dict[Hashable, Set[str]]] = None,
chromosome_relative_coordinates: bool = True,
raise_on_reserved_attributes: Optional[bool] = True,
) -> Iterator[GFFRow]:
"""Yields GFF3 rows for this CDS.

This is a thin wrapper that delegates to ``_to_gff_or_gtf`` with ``GFFRow`` as the row type and
``GFFAttributes`` as the attribute type; all other arguments are passed through unchanged.

The additional qualifiers are used when writing a hierarchical relationship back to files. GFF files
are easier to work with if the children features have the qualifiers of their parents.

Args:
parent: ID of the Parent of this transcript.
parent_qualifiers: Directly pull qualifiers in from this dictionary.
chromosome_relative_coordinates: Output GFF in chromosome-relative coordinates? If ``False``, an exception
is raised when there is no ``sequence_chunk`` ancestor type.
raise_on_reserved_attributes: If ``True``, then GFF3 reserved attributes such as ``ID`` and ``Name`` present
in the qualifiers will lead to an exception and not a warning.

Yields:
:class:`~biocantor.io.gff3.rows.GFFRow`

Raises:
NoSuchAncestorException: If ``chromosome_relative_coordinates`` is ``False`` but there is no
``sequence_chunk`` ancestor type.
GFF3MissingSequenceNameError: If there are no sequence names associated with this transcript.
"""
# The trailing two arguments select the GFF3 row/attribute classes in the shared helper.
yield from self._to_gff_or_gtf(
parent,
parent_qualifiers,
chromosome_relative_coordinates,
raise_on_reserved_attributes,
GFFRow,
GFFAttributes,
)

def to_gtf(
self,
parent: Optional[str] = None,
parent_qualifiers: Optional[Dict[Hashable, Set[str]]] = None,
chromosome_relative_coordinates: bool = True,
) -> Iterator[GTFRow]:
"""Yields GTF rows for this CDS.

This is a thin wrapper that delegates to ``_to_gff_or_gtf`` with ``GTFRow`` as the row type and
``GTFAttributes`` as the attribute type. Unlike :meth:`to_gff`, there is no
``raise_on_reserved_attributes`` argument; the helper is always called with that flag set to ``False``.

The additional qualifiers are used when writing a hierarchical relationship back to files. GTF files
are easier to work with if the children features have the qualifiers of their parents.

Args:
parent: ID of the Parent of this transcript.
parent_qualifiers: Directly pull qualifiers in from this dictionary.
chromosome_relative_coordinates: Output GTF in chromosome-relative coordinates? If ``False``, an exception
is raised when there is no ``sequence_chunk`` ancestor type.

Yields:
``GTFRow`` (imported from ``biocantor.io.gff3.rows``)

Raises:
NoSuchAncestorException: If ``chromosome_relative_coordinates`` is ``False`` but there is no
``sequence_chunk`` ancestor type.
GFF3MissingSequenceNameError: If there are no sequence names associated with this transcript.
NOTE(review): carried over from ``to_gff``; confirm the GTF path raises this same exception type.
"""
# raise_on_reserved_attributes is fixed to False for GTF export; the last two arguments
# select the GTF row/attribute classes in the shared helper.
yield from self._to_gff_or_gtf(
parent,
parent_qualifiers,
chromosome_relative_coordinates,
False,
GTFRow,
GTFAttributes,
)

@property
def has_canonical_start_codon(self) -> bool:
"""Does this CDS have a canonical valid start? Requires a sequence be associated."""
Expand Down Expand Up @@ -717,7 +766,6 @@ def _prepare_multi_exon_window_for_scan_codon_locations(
loc = self.chromosome_location
# zip_longest is used here to ensure that the two iterators are always actually in sync
for exon, frame in zip_longest(self._exon_iter(False), self._frame_iter(False)):

if exon is None or frame is None:
raise MismatchedFrameException("Frame iterator is not in sync with exon iterator")

Expand Down
File renamed without changes.
4 changes: 2 additions & 2 deletions inscripta/biocantor/gene/codon.py → biocantor/gene/codon.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from enum import IntEnum
from typing import TYPE_CHECKING, List, Optional, Union

from inscripta.biocantor.constants import gencode, extended_gencode, aacodons
from biocantor.constants import gencode, extended_gencode, aacodons

if TYPE_CHECKING:
from inscripta.biocantor.sequence.sequence import Sequence
from biocantor.sequence.sequence import Sequence


class TranslationTable(IntEnum):
Expand Down
Loading