InscriptaLabs · ifiddes · Apr 19, 2023 · May 19, 2023 · May 19, 2023 · May 19, 2023
diff --git a/.gitattributes b/.gitattributes
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -0,0 +1,18 @@
+name: Python package
+
+on: [push]  # triggered by every push event; no branch filter is configured
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: setup python for tox
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'  # quoted so YAML keeps the string "3.10" rather than the float 3.1
+      - name: install tox
+        run: python -m pip install tox tox-conda
+      - name: Test with tox
+        run: |
+          tox
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
+## [1.0.0] 2023-05-19
+### Changed
+- GenBank parser will not optimize CDS blocks, so that CDS intervals with adjacent sites can be loaded as such.
+
 ## [0.19.0] 2022-10-21
 ### Added
 - `AA_EXTENDED`, `AA_STRICT_GAPPED`, `AA_EXTENDED_GAPPED`, and `AA_STRICT_UNKNOWN` alphabets.
@@ -166,7 +170,7 @@ as child of the GeneInterval or FeatureCollectionInterval objects. Fix bug intro
 ## [0.7.0]
 ### Changed
 - GenBank position-sorted parser can now handle CDS records that are not directly following a gene record.
-- Refactor `Location`, `Parent` and `Sequence` to have base classes `AbstractLocation`, `AbstractParent` and `AbstractSequence` that are in the base of the `inscripta.biocantor.location` module. This greatly helps with resolving circular imports.
+- Refactor `Location`, `Parent` and `Sequence` to have base classes `AbstractLocation`, `AbstractParent` and `AbstractSequence` that are in the base of the `biocantor.location` module. This greatly helps with resolving circular imports.
 - Optimized checking `sequence` and `location` members to explicitly check for `None`. This avoids a call to `__len__`.
 - `CompoundInterval._single_intervals` is now lazily evaluated, because it is expensive to generate many `SingleInterval` objects.
 - `CompoundInterval` now stores the positions as two sorted integer lists.

diff --git a/LICENSE.txt b/LICENSE.txt
diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py
@@ -1,11 +1,11 @@
 from pathlib import Path
 
-from inscripta.biocantor.gene import CDSInterval
-from inscripta.biocantor.io.genbank.parser import parse_genbank, ParsedAnnotationRecord
-from inscripta.biocantor.io.gff3.parser import parse_standard_gff3
-from inscripta.biocantor.location import Strand, SingleInterval
-from inscripta.biocantor.parent import Parent, SequenceType
-from inscripta.biocantor.sequence import Sequence, Alphabet
+from biocantor.gene import CDSInterval
+from biocantor.io.genbank.parser import parse_genbank, ParsedAnnotationRecord
+from biocantor.io.gff3.parser import parse_standard_gff3
+from biocantor.location import Strand, SingleInterval
+from biocantor.parent import Parent, SequenceType
+from biocantor.sequence import Sequence, Alphabet
 
 DATA_DIR = Path(__file__).parent.parent / "tests/data"
 

diff --git a/inscripta/biocantor/__init__.py → biocantor/__init__.py b/inscripta/biocantor/__init__.py → biocantor/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.19.0"
+__version__ = "1.0.0"
 
 from abc import ABC, abstractmethod
 from enum import Enum
@@ -322,6 +322,7 @@ def intersection(
         match_strand: bool = True,
         full_span: bool = False,
         strict_parent_compare: bool = False,
+        optimize_blocks: bool = True,
     ) -> "AbstractLocation":
         """Returns a new Location representing the intersection of this Location with the other Location.
         Returned Location, if nonempty, has the same Strand as this Location. This operation is commutative
@@ -338,6 +339,8 @@ def intersection(
             If set to True, compare the full span of this Location to the full span of the other Location.
         strict_parent_compare
             Raise MismatchedParentException if parents do not match
+        optimize_blocks
+            Should the resulting blocks be optimized? Defaults to True.
 
         """
 

diff --git a/inscripta/biocantor/constants.py → biocantor/constants.py b/inscripta/biocantor/constants.py → biocantor/constants.py
diff --git a/inscripta/biocantor/exc.py → biocantor/exc.py b/inscripta/biocantor/exc.py → biocantor/exc.py
diff --git a/biocantor/gene/__init__.py b/biocantor/gene/__init__.py
@@ -0,0 +1,17 @@
+"""
+Special feature arithmetic operations for CDSs, codons and translation.
+
+Container classes wrap locations to model genes, transcripts and generic genomic intervals.
+"""
+
+from biocantor.gene.biotype import Biotype  # noqa F401
+from biocantor.gene.cds_frame import CDSPhase, CDSFrame  # noqa F401
+from biocantor.gene.codon import Codon, TranslationTable  # noqa F401
+from biocantor.gene.cds import CDSInterval  # noqa F401
+from biocantor.gene.feature import FeatureInterval, FeatureIntervalCollection  # noqa F401
+from biocantor.gene.transcript import TranscriptInterval  # noqa F401
+from biocantor.gene.collections import (  # noqa F401
+    AnnotationCollection,
+)
+from biocantor.gene.gene import GeneInterval  # noqa F401
+from biocantor.gene.variants import VariantInterval, VariantIntervalCollection  # noqa F401
diff --git a/inscripta/biocantor/gene/biotype.py → biocantor/gene/biotype.py b/inscripta/biocantor/gene/biotype.py → biocantor/gene/biotype.py
@@ -2,7 +2,7 @@
 Biotypes are types of genes and transcripts, as defined by NCBI (INSDC) and Sequence Ontology.
 """
 
-from inscripta.biocantor.util.enum import HasMemberMixin
+from biocantor.util.enum import HasMemberMixin
 
 
 Biotype = HasMemberMixin(

diff --git a/inscripta/biocantor/gene/cds.py → biocantor/gene/cds.py b/inscripta/biocantor/gene/cds.py → biocantor/gene/cds.py
@@ -1,30 +1,30 @@
 import warnings
 from itertools import count, zip_longest
-from typing import Iterator, List, Union, Optional, Dict, Hashable, Any, Set, Tuple, TYPE_CHECKING
+from typing import Iterator, List, Union, Optional, Dict, Hashable, Any, Set, Tuple, TYPE_CHECKING, Type
 from uuid import UUID
 
 from methodtools import lru_cache
 
-from inscripta.biocantor.exc import (
+from biocantor.exc import (
     InvalidCDSIntervalError,
     NoSuchAncestorException,
     LocationOverlapException,
     MismatchedFrameException,
     EmptyLocationException,
 )
-from inscripta.biocantor.gene.cds_frame import CDSPhase, CDSFrame
-from inscripta.biocantor.gene.codon import Codon, TranslationTable
-from inscripta.biocantor.gene.interval import AbstractFeatureInterval, QualifierValue
-from inscripta.biocantor.io.bed import RGB, BED12
-from inscripta.biocantor.io.gff3.constants import GFF_SOURCE, NULL_COLUMN, BioCantorFeatureTypes, BioCantorQualifiers
-from inscripta.biocantor.io.gff3.rows import GFFAttributes, GFFRow
-from inscripta.biocantor.location import Location, Strand, SingleInterval, CompoundInterval
-from inscripta.biocantor.parent import Parent, SequenceType
-from inscripta.biocantor.sequence import Sequence, Alphabet
-from inscripta.biocantor.util.hashing import digest_object
+from biocantor.gene.cds_frame import CDSPhase, CDSFrame
+from biocantor.gene.codon import Codon, TranslationTable
+from biocantor.gene.interval import AbstractFeatureInterval, QualifierValue
+from biocantor.io.bed import RGB, BED12
+from biocantor.io.gff3.constants import GFF_SOURCE, NULL_COLUMN, BioCantorFeatureTypes, BioCantorQualifiers
+from biocantor.io.gff3.rows import GFFAttributes, GFFRow, GTFRow, GTFAttributes
+from biocantor.location import Location, Strand, SingleInterval, CompoundInterval
+from biocantor.parent import Parent, SequenceType
+from biocantor.sequence import Sequence, Alphabet
+from biocantor.util.hashing import digest_object
 
 if TYPE_CHECKING:
-    from inscripta.biocantor.gene.variants import VariantIntervalCollection, VariantInterval
+    from biocantor.gene.variants import VariantIntervalCollection, VariantInterval
 
 
 class CDSInterval(AbstractFeatureInterval):
@@ -51,7 +51,6 @@ def __init__(
         guid: Optional[UUID] = None,
         parent_or_seq_chunk_parent: Optional[Parent] = None,
     ):
-
         self._location = self.initialize_location(cds_starts, cds_ends, strand, parent_or_seq_chunk_parent)
         self._genomic_starts = cds_starts
         self._genomic_ends = cds_ends
@@ -141,7 +140,6 @@ def chunk_relative_frames(self) -> List[CDSFrame]:
         distance_from_start = fivep_phase
 
         for genomic_exon in self._exon_iter(chunk_relative_exon=False):
-
             # chromosome location has overlapping blocks merged, so that the intersection always has one block
             # this is OK to do here since the original genomic intervals retain the overlapping information
             if isinstance(self._chunk_relative_bounded_chromosome_location, SingleInterval):
@@ -277,7 +275,7 @@ def from_chunk_relative_location(
 
         .. code-block:: python
 
-            from inscripta.biocantor.io.parser import seq_chunk_to_parent
+            from biocantor.io.parser import seq_chunk_to_parent
             parent = seq_chunk_to_parent('AANAAATGGCGAGCACCTAACCCCCNCC', "NC_000913.3", 222213, 222241)
             loc = SingleInterval(5, 20, Strand.PLUS, parent=parent)
 
@@ -322,35 +320,15 @@ def export_qualifiers(
             qualifiers[key].add(val)
         return qualifiers
 
-    def to_gff(
+    def _to_gff_or_gtf(
         self,
         parent: Optional[str] = None,
         parent_qualifiers: Optional[Dict[Hashable, Set[str]]] = None,
         chromosome_relative_coordinates: bool = True,
         raise_on_reserved_attributes: Optional[bool] = True,
-    ) -> Iterator[GFFRow]:
-        """Writes a GFF format list of lists for this CDS.
-
-        The additional qualifiers are used when writing a hierarchical relationship back to files. GFF files
-        are easier to work with if the children features have the qualifiers of their parents.
-
-        Args:
-            parent: ID of the Parent of this transcript.
-            parent_qualifiers: Directly pull qualifiers in from this dictionary.
-            chromosome_relative_coordinates: Output GFF in chromosome-relative coordinates? Will raise an exception
-                if there is not a ``sequence_chunk`` ancestor type.
-            raise_on_reserved_attributes: If ``True``, then GFF3 reserved attributes such as ``ID`` and ``Name`` present
-                in the qualifiers will lead to an exception and not a warning.
-
-        Yields:
-            :class:`~biocantor.io.gff3.rows.GFFRow`
-
-        Raises:
-            NoSuchAncestorException: If ``chromosome_relative_coordinates`` is ``False`` but there is no
-            ``sequence_chunk`` ancestor type.
-            GFF3MissingSequenceNameError: If there are no sequence names associated with this transcript.
-        """
-
+        row_type: Union[Type[GFFRow], Type[GTFRow]] = GFFRow,
+        attribute_type: Union[Type[GFFAttributes], Type[GTFAttributes]] = GFFAttributes,
+    ) -> Iterator[Union[GFFRow, GTFRow]]:
         if not chromosome_relative_coordinates and not self.has_ancestor_of_type(SequenceType.SEQUENCE_CHUNK):
             raise NoSuchAncestorException(
                 "Cannot export GFF in relative coordinates without a sequence_chunk ancestor."
@@ -369,14 +347,14 @@ def to_gff(
 
         for i, block, frame in zip(count(1), cds_blocks, frames):
             start, end = block
-            attributes = GFFAttributes(
+            attributes = attribute_type(
                 id=f"{cds_guid}-{i}",
                 qualifiers=qualifiers,
                 name=self.protein_id,
                 parent=parent,
                 raise_on_reserved_attributes=raise_on_reserved_attributes,
             )
-            row = GFFRow(
+            row = row_type(
                 self.sequence_name,
                 GFF_SOURCE,
                 BioCantorFeatureTypes.CDS,
@@ -389,6 +367,77 @@ def to_gff(
             )
             yield row
 
+    def to_gff(
+        self,
+        parent: Optional[str] = None,
+        parent_qualifiers: Optional[Dict[Hashable, Set[str]]] = None,
+        chromosome_relative_coordinates: bool = True,
+        raise_on_reserved_attributes: Optional[bool] = True,
+    ) -> Iterator[GFFRow]:
+        """Yields GFF rows for this CDS, one row per CDS block.
+
+        The additional qualifiers are used when writing a hierarchical relationship back to files. GFF files
+        are easier to work with if the children features have the qualifiers of their parents.
+
+        Args:
+            parent: ID of the parent feature of this CDS; passed to each row's attributes as ``parent``.
+            parent_qualifiers: Directly pull qualifiers in from this dictionary.
+            chromosome_relative_coordinates: Output GFF in chromosome-relative coordinates? If ``False``, raises
+                an exception when there is no ``sequence_chunk`` ancestor type.
+            raise_on_reserved_attributes: If ``True``, then GFF3 reserved attributes such as ``ID`` and ``Name`` present
+                in the qualifiers will lead to an exception and not a warning.
+
+        Yields:
+            :class:`~biocantor.io.gff3.rows.GFFRow`
+
+        Raises:
+            NoSuchAncestorException: If ``chromosome_relative_coordinates`` is ``False`` but there is no
+                ``sequence_chunk`` ancestor type.
+            GFF3MissingSequenceNameError: If there are no sequence names associated with this transcript.
+        """
+        yield from self._to_gff_or_gtf(
+            parent,
+            parent_qualifiers,
+            chromosome_relative_coordinates,
+            raise_on_reserved_attributes,
+            GFFRow,  # row_type
+            GFFAttributes,  # attribute_type
+        )
+
+    def to_gtf(
+        self,
+        parent: Optional[str] = None,
+        parent_qualifiers: Optional[Dict[Hashable, Set[str]]] = None,
+        chromosome_relative_coordinates: bool = True,
+    ) -> Iterator[GTFRow]:
+        """Yields GTF rows for this CDS, one row per CDS block.
+
+        The additional qualifiers are used when writing a hierarchical relationship back to files. GTF files
+        are easier to work with if the children features have the qualifiers of their parents.
+
+        Args:
+            parent: ID of the parent feature of this CDS; passed to each row's attributes as ``parent``.
+            parent_qualifiers: Directly pull qualifiers in from this dictionary.
+            chromosome_relative_coordinates: Output GTF in chromosome-relative coordinates? If ``False``, raises
+                an exception when there is no ``sequence_chunk`` ancestor type.
+
+        Yields:
+            :class:`~biocantor.io.gff3.rows.GTFRow`
+
+        Raises:
+            NoSuchAncestorException: If ``chromosome_relative_coordinates`` is ``False`` but there is no
+                ``sequence_chunk`` ancestor type.
+            GFF3MissingSequenceNameError: If there are no sequence names associated with this transcript.
+        """
+        yield from self._to_gff_or_gtf(
+            parent,
+            parent_qualifiers,
+            chromosome_relative_coordinates,
+            False,  # raise_on_reserved_attributes: not exposed by to_gtf, always disabled
+            GTFRow,  # row_type
+            GTFAttributes,  # attribute_type
+        )
+
     @property
     def has_canonical_start_codon(self) -> bool:
         """Does this CDS have a canonical valid start? Requires a sequence be associated."""
@@ -717,7 +766,6 @@ def _prepare_multi_exon_window_for_scan_codon_locations(
         loc = self.chromosome_location
         # zip_longest is used here to ensure that the two iterators are always actually in sync
         for exon, frame in zip_longest(self._exon_iter(False), self._frame_iter(False)):
-
             if exon is None or frame is None:
                 raise MismatchedFrameException("Frame iterator is not in sync with exon iterator")
 

diff --git a/inscripta/biocantor/gene/cds_frame.py → biocantor/gene/cds_frame.py b/inscripta/biocantor/gene/cds_frame.py → biocantor/gene/cds_frame.py
diff --git a/inscripta/biocantor/gene/codon.py → biocantor/gene/codon.py b/inscripta/biocantor/gene/codon.py → biocantor/gene/codon.py
@@ -1,10 +1,10 @@
 from enum import IntEnum
 from typing import TYPE_CHECKING, List, Optional, Union
 
-from inscripta.biocantor.constants import gencode, extended_gencode, aacodons
+from biocantor.constants import gencode, extended_gencode, aacodons
 
 if TYPE_CHECKING:
-    from inscripta.biocantor.sequence.sequence import Sequence
+    from biocantor.sequence.sequence import Sequence
 
 
 class TranslationTable(IntEnum):