diff --git a/.bumpversion.cfg b/.bumpversion.cfg deleted file mode 100644 index a9b52f8..0000000 --- a/.bumpversion.cfg +++ /dev/null @@ -1,28 +0,0 @@ -[bumpversion] -current_version = 0.0.2-dev -commit = True -tag = False -parse = (?P\d+)\.(?P\d+)\.(?P\d+)(?:-(?P[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?(?:\+(?P[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))? -serialize = - {major}.{minor}.{patch}-{release}+{build} - {major}.{minor}.{patch}+{build} - {major}.{minor}.{patch}-{release} - {major}.{minor}.{patch} - -[bumpversion:part:release] -optional_value = production -first_value = dev -values = - dev - production - -[bumpverion:part:build] -values = [0-9A-Za-z-]+ - -[bumpversion:file:setup.cfg] -search = version = {current_version} -replace = version = {new_version} - -[bumpversion:file:src/versalign/version.py] -search = VERSION = "{current_version}" -replace = VERSION = "{new_version}" diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md deleted file mode 100644 index 10fa25a..0000000 --- a/.github/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,132 +0,0 @@ -# Contributor Covenant Code of Conduct - -## Our Pledge - -We as members, contributors, and leaders pledge to make participation in our -community a harassment-free experience for everyone, regardless of age, body -size, visible or invisible disability, ethnicity, sex characteristics, gender -identity and expression, level of experience, education, socio-economic status, -nationality, personal appearance, race, caste, color, religion, or sexual -identity and orientation. - -We pledge to act and interact in ways that contribute to an open, welcoming, -diverse, inclusive, and healthy community. 
- -## Our Standards - -Examples of behavior that contributes to a positive environment for our -community include: - -* Demonstrating empathy and kindness toward other people -* Being respectful of differing opinions, viewpoints, and experiences -* Giving and gracefully accepting constructive feedback -* Accepting responsibility and apologizing to those affected by our mistakes, - and learning from the experience -* Focusing on what is best not just for us as individuals, but for the overall - community - -Examples of unacceptable behavior include: - -* The use of sexualized language or imagery, and sexual attention or advances of - any kind -* Trolling, insulting or derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or email address, - without their explicit permission -* Other conduct which could reasonably be considered inappropriate in a - professional setting - -## Enforcement Responsibilities - -Community leaders are responsible for clarifying and enforcing our standards of -acceptable behavior and will take appropriate and fair corrective action in -response to any behavior that they deem inappropriate, threatening, offensive, -or harmful. - -Community leaders have the right and responsibility to remove, edit, or reject -comments, commits, code, wiki edits, issues, and other contributions that are -not aligned to this Code of Conduct, and will communicate reasons for moderation -decisions when appropriate. - -## Scope - -This Code of Conduct applies within all community spaces, and also applies when -an individual is officially representing the community in public spaces. -Examples of representing our community include using an official e-mail address, -posting via an official social media account, or acting as an appointed -representative at an online or offline event. 
- -## Enforcement - -Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported to the community leaders responsible for enforcement at -david.meijer@wur.nl. -All complaints will be reviewed and investigated promptly and fairly. - -All community leaders are obligated to respect the privacy and security of the -reporter of any incident. - -## Enforcement Guidelines - -Community leaders will follow these Community Impact Guidelines in determining -the consequences for any action they deem in violation of this Code of Conduct: - -### 1. Correction - -**Community Impact**: Use of inappropriate language or other behavior deemed -unprofessional or unwelcome in the community. - -**Consequence**: A private, written warning from community leaders, providing -clarity around the nature of the violation and an explanation of why the -behavior was inappropriate. A public apology may be requested. - -### 2. Warning - -**Community Impact**: A violation through a single incident or series of -actions. - -**Consequence**: A warning with consequences for continued behavior. No -interaction with the people involved, including unsolicited interaction with -those enforcing the Code of Conduct, for a specified period of time. This -includes avoiding interactions in community spaces as well as external channels -like social media. Violating these terms may lead to a temporary or permanent -ban. - -### 3. Temporary Ban - -**Community Impact**: A serious violation of community standards, including -sustained inappropriate behavior. - -**Consequence**: A temporary ban from any sort of interaction or public -communication with the community for a specified period of time. No public or -private interaction with the people involved, including unsolicited interaction -with those enforcing the Code of Conduct, is allowed during this period. -Violating these terms may lead to a permanent ban. - -### 4. 
Permanent Ban - -**Community Impact**: Demonstrating a pattern of violation of community -standards, including sustained inappropriate behavior, harassment of an -individual, or aggression toward or disparagement of classes of individuals. - -**Consequence**: A permanent ban from any sort of public interaction within the -community. - -## Attribution - -This Code of Conduct is adapted from the [Contributor Covenant][homepage], -version 2.1, available at -[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. - -Community Impact Guidelines were inspired by -[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. - -For answers to common questions about this code of conduct, see the FAQ at -[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at -[https://www.contributor-covenant.org/translations][translations]. - -[homepage]: https://www.contributor-covenant.org -[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html -[Mozilla CoC]: https://github.com/mozilla/diversity -[FAQ]: https://www.contributor-covenant.org/faq -[translations]: https://www.contributor-covenant.org/translations \ No newline at end of file diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md deleted file mode 100644 index 9872967..0000000 --- a/.github/CONTRIBUTING.md +++ /dev/null @@ -1,106 +0,0 @@ -# Contributing - -Contributions to this repository are welcomed and encouraged. - -## Code Contribution - -This project uses the [GitHub Flow](https://guides.github.com/introduction/flow) -model for code contributions. Follow these steps: - -1. [Create a fork](https://help.github.com/articles/fork-a-repo) of the upstream - repository at [`davidmeijer/versalign`](https://github.com/davidmeijer/versalign) - on your GitHub account (or in one of your organizations) -2. 
[Clone your fork](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository) - with `git clone https://github.com//versalign.git` -3. Make and commit changes to your fork with `git commit` -4. Push changes to your fork with `git push` -5. Repeat steps 3 and 4 as needed -6. Submit a pull request back to the upstream repository - -### Merge Model - -This repository uses [squash merges](https://docs.github.com/en/github/collaborating-with-pull-requests/incorporating-changes-from-a-pull-request/about-pull-request-merges#squash-and-merge-your-pull-request-commits) -to group all related commits in a given pull request into a single commit upon -acceptance and merge into the main branch. This has several benefits: - -1. Keeps the commit history on the main branch focused on high-level narrative -2. Enables people to make lots of small commits without worrying about muddying - up the commit history -3. Commits correspond 1-to-1 with pull requests - -### Code Style - -This project encourages the use of optional static typing. It -uses [`mypy`](http://mypy-lang.org/) as a type checker -and [`sphinx_autodoc_typehints`](https://github.com/agronholm/sphinx-autodoc-typehints) -to automatically generate documentation based on type hints. You can check if -your code passes `mypy` with `tox -e mypy`. - -This project uses [`black`](https://github.com/psf/black) to automatically -enforce a consistent code style. You can apply `black` and other pre-configured -linters with `tox -e lint`. - -This project uses [`flake8`](https://flake8.pycqa.org) and several plugins for -additional checks of documentation style, security issues, good variable -nomenclature, and more ( -see [`tox.ini`](tox.ini) for a list of flake8 plugins). You can check if your -code passes `flake8` with `tox -e flake8`. - -Each of these checks are run on each commit using GitHub Actions as a continuous -integration service. 
Passing all of them is required for accepting a -contribution. If you're unsure how to address the feedback from one of these -tools, please say so either in the description of your pull request or in a -comment, and we will help you. - -### Logging - -Python's builtin `print()` should not be used (except when writing to files), -it's checked by the -[`flake8-print`](https://github.com/jbkahn/flake8-print) plugin to `flake8`. If -you're in a command line setting or `main()` function for a module, you can use -`click.echo()`. Otherwise, you can use the builtin `logging` module by adding -`logger = logging.getLogger(__name__)` below the imports at the top of your -file. - -### Documentation - -All public functions (i.e., not starting with an underscore `_`) must be -documented using the [sphinx documentation format](https://sphinx-rtd-tutorial.readthedocs.io/en/latest/docstrings.html#the-sphinx-docstring-format). -The [`darglint`](https://github.com/terrencepreilly/darglint) plugin to `flake8` -reports on functions that are not fully documented. - -### Testing - -Functions in this repository should be unit tested. These can either be written -using the `unittest` framework in the `tests/` directory or as embedded -doctests. You can check that the unit tests pass with `tox -e py` and that the -doctests pass with `tox -e doctests`. These tests are required to pass for -accepting a contribution. - -### Syncing your fork - -If other code is updated before your contribution gets merged, you might need to -resolve conflicts against the main branch. After cloning, you should add the -upstream repository with - -```shell -$ git remote add davidmeijer https://github.com/davidmeijer/versalign.git -``` - -Then, you can merge upstream code into your branch. You can also use the GitHub -UI to do this by following [this tutorial](https://docs.github.com/en/github/collaborating-with-pull-requests/working-with-forks/syncing-a-fork). 
- -### Python Version Compatibility - -This project aims to support all versions of Python that have not passed their -end-of-life dates. After end-of-life, the version will be removed from the Trove -qualifiers in the [`setup.cfg`](setup.cfg) and from the GitHub Actions testing -configuration. - -See https://endoflife.date/python for a timeline of Python release and -end-of-life dates. - -## Acknowledgements - -These code contribution guidelines are derived from the [cthoyt/cookiecutter-snekpack](https://github.com/cthoyt/cookiecutter-snekpack) -Python package template. They're free to reuse and modify as long as they're properly acknowledged. \ No newline at end of file diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 183c367..8dc28f4 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,52 +1,40 @@ -name: Tests +name: testing & quality on: push: - branches: [ main, dev ] + branches: [ main ] + pull_request: + branches: [ main ] jobs: - lint: - name: Lint - strategy: - matrix: - os: [ ubuntu-latest ] - python-version: [ "3.10" ] + qa: + name: linting and tests runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - sudo apt-get install graphviz - pip install tox - - name: Check code quality with flake8 - run: tox run -e flake8 - - name: Check package metadata with Pyroma - run: tox run -e pyroma - - name: Check static typing with MyPy - run: tox run -e mypy - tests: - name: Tests strategy: matrix: os: [ ubuntu-latest ] python-version: [ "3.10" ] - runs-on: ${{ matrix.os }} + steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + - name: check out code + uses: actions/checkout@v4 + + - name: set up Python ${{ matrix.python-version }} + uses: 
actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: pip install tox tox-uv - - name: Test with pytest and generate coverage file - run: - tox run -e py - - name: Upload coverage report to codecov - uses: codecov/codecov-action@v4.0.1 - with: - token: ${{ secrets.CODECOV_TOKEN }} \ No newline at end of file + cache: "pip" + + - name: install package with dev dependencies + run: | + python -m pip install --upgrade pip + pip install -e .[dev] + + - name: ruff lint + run: ruff check src + + - name: ruff format check + run: ruff format --check src + + - name: run tests + run: pytest -q \ No newline at end of file diff --git a/.gitignore b/.gitignore index 6b9ce90..3dcdffe 100644 --- a/.gitignore +++ b/.gitignore @@ -1,908 +1,8 @@ -# Created by https://www.toptal.com/developers/gitignore/api/macos,linux,windows,python,jupyternotebooks,jetbrains,pycharm,vim,emacs,visualstudiocode,visualstudio -# Edit at https://www.toptal.com/developers/gitignore?templates=macos,linux,windows,python,jupyternotebooks,jetbrains,pycharm,vim,emacs,visualstudiocode,visualstudio - -### Emacs ### -# -*- mode: gitignore; -*- -*~ -\#*\# -/.emacs.desktop -/.emacs.desktop.lock -*.elc -auto-save-list -tramp -.\#* - -# Org-mode -.org-id-locations -*_archive - -# flymake-mode -*_flymake.* - -# eshell files -/eshell/history -/eshell/lastdir - -# elpa packages -/elpa/ - -# reftex files -*.rel - -# AUCTeX auto folder -/auto/ - -# cask packages -.cask/ -dist/ - -# Flycheck -flycheck_*.el - -# server auth directory -/server/ - -# projectiles files -.projectile - -# directory configuration -.dir-locals.el - -# network security -/network-security.data - - -### JetBrains ### -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff -.idea/**/workspace.xml -.idea/**/tasks.xml 
-.idea/**/usage.statistics.xml -.idea/**/dictionaries -.idea/**/shelf - -# AWS User-specific -.idea/**/aws.xml - -# Generated files -.idea/**/contentModel.xml - -# Sensitive or high-churn files -.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml -.idea/**/dbnavigator.xml - -# Gradle -.idea/**/gradle.xml -.idea/**/libraries - -# Gradle and Maven with auto-import -# When using Gradle or Maven with auto-import, you should exclude module files, -# since they will be recreated, and may cause churn. Uncomment if using -# auto-import. -# .idea/artifacts -# .idea/compiler.xml -# .idea/jarRepositories.xml -# .idea/modules.xml -# .idea/*.iml -# .idea/modules -# *.iml -# *.ipr - -# CMake -cmake-build-*/ - -# Mongo Explorer plugin -.idea/**/mongoSettings.xml - -# File-based project format -*.iws - -# IntelliJ -out/ - -# mpeltonen/sbt-idea plugin -.idea_modules/ - -# JIRA plugin -atlassian-ide-plugin.xml - -# Cursive Clojure plugin -.idea/replstate.xml - -# SonarLint plugin -.idea/sonarlint/ - -# Crashlytics plugin (for Android Studio and IntelliJ) -com_crashlytics_export_strings.xml -crashlytics.properties -crashlytics-build.properties -fabric.properties - -# Editor-based Rest Client -.idea/httpRequests - -# Android studio 3.1+ serialized cache file -.idea/caches/build_file_checksums.ser - -### JetBrains Patch ### -# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 - -# *.iml -# modules.xml -# .idea/misc.xml -# *.ipr - -# Sonarlint plugin -# https://plugins.jetbrains.com/plugin/7973-sonarlint -.idea/**/sonarlint/ - -# SonarQube Plugin -# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin -.idea/**/sonarIssues.xml - -# Markdown Navigator plugin -# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced -.idea/**/markdown-navigator.xml -.idea/**/markdown-navigator-enh.xml -.idea/**/markdown-navigator/ - 
-# Cache file creation bug -# See https://youtrack.jetbrains.com/issue/JBR-2257 -.idea/$CACHE_FILE$ - -# CodeStream plugin -# https://plugins.jetbrains.com/plugin/12206-codestream -.idea/codestream.xml - -### JupyterNotebooks ### -# gitignore template for Jupyter Notebooks -# website: http://jupyter.org/ - -.ipynb_checkpoints -*/.ipynb_checkpoints/* - -# IPython -profile_default/ -ipython_config.py - -# Remove previous ipynb_checkpoints -# git rm -r .ipynb_checkpoints/ - -### Linux ### - -# temporary files which can be created if a process still has a handle open of a deleted file -.fuse_hidden* - -# KDE directory preferences -.directory - -# Linux trash folder which might appear on any partition or disk -.Trash-* - -# .nfs files are created when an open file is removed but is still being accessed -.nfs* - -### macOS ### -# General -.DS_Store -.AppleDouble -.LSOverride - -# Icon must end with two \r -Icon - - -# Thumbnails -._* - -# Files that might appear in the root of a volume -.DocumentRevisions-V100 -.fseventsd -.Spotlight-V100 -.TemporaryItems -.Trashes -.VolumeIcon.icns -.com.apple.timemachine.donotpresent - -# Directories potentially created on remote AFP share -.AppleDB -.AppleDesktop -Network Trash Folder -Temporary Items -.apdisk - -### PyCharm ### -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff - -# AWS User-specific - -# Generated files - -# Sensitive or high-churn files - -# Gradle - -# Gradle and Maven with auto-import -# When using Gradle or Maven with auto-import, you should exclude module files, -# since they will be recreated, and may cause churn. Uncomment if using -# auto-import. 
-# .idea/artifacts -# .idea/compiler.xml -# .idea/jarRepositories.xml -# .idea/modules.xml -# .idea/*.iml -# .idea/modules -# *.iml -# *.ipr - -# CMake - -# Mongo Explorer plugin - -# File-based project format - -# IntelliJ - -# mpeltonen/sbt-idea plugin - -# JIRA plugin - -# Cursive Clojure plugin - -# SonarLint plugin - -# Crashlytics plugin (for Android Studio and IntelliJ) - -# Editor-based Rest Client - -# Android studio 3.1+ serialized cache file - -### PyCharm Patch ### -# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 - -# *.iml -# modules.xml -# .idea/misc.xml -# *.ipr - -# Sonarlint plugin -# https://plugins.jetbrains.com/plugin/7973-sonarlint - -# SonarQube Plugin -# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin - -# Markdown Navigator plugin -# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced - -# Cache file creation bug -# See https://youtrack.jetbrains.com/issue/JBR-2257 - -# CodeStream plugin -# https://plugins.jetbrains.com/plugin/12206-codestream - -### Python ### -# Byte-compiled / optimized / DLL files +.pytest_cache __pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ -docs/build -docs/source/api - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook - -# IPython - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv +.DS_Store +.vscode/ env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintainted in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ - -### Vim ### -# Swap -[._]*.s[a-v][a-z] -!*.svg # comment out if you don't need vector files -[._]*.sw[a-p] -[._]s[a-rt-v][a-z] -[._]ss[a-gi-z] -[._]sw[a-p] - -# Session -Session.vim -Sessionx.vim - -# Temporary -.netrwhist -# Auto-generated tag files -tags -# Persistent undo -[._]*.un~ - -### VisualStudioCode ### -.vscode -.vscode/* -!.vscode/settings.json -!.vscode/tasks.json -!.vscode/launch.json -!.vscode/extensions.json -!.vscode/*.code-snippets - -# Local History for Visual Studio Code -.history/ - -# Built Visual Studio Code Extensions -*.vsix - -### VisualStudioCode Patch ### -# Ignore all local history of files -.history -.ionide - -# Support for Project snippet scope - -### Windows ### -# Windows thumbnail cache files -Thumbs.db -Thumbs.db:encryptable -ehthumbs.db -ehthumbs_vista.db - -# Dump file -*.stackdump - -# Folder config file -[Dd]esktop.ini - -# Recycle Bin used on file shares -$RECYCLE.BIN/ - -# Windows Installer files -*.cab -*.msi -*.msix -*.msm -*.msp - -# Windows shortcuts -*.lnk - -### VisualStudio ### -## Ignore Visual Studio temporary files, 
build results, and -## files generated by popular Visual Studio add-ons. -## -## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore - -# User-specific files -*.rsuser -*.suo -*.user -*.userosscache -*.sln.docstates - -# User-specific files (MonoDevelop/Xamarin Studio) -*.userprefs - -# Mono auto generated files -mono_crash.* - -# Build results -[Dd]ebug/ -[Dd]ebugPublic/ -[Rr]elease/ -[Rr]eleases/ -x64/ -x86/ -[Ww][Ii][Nn]32/ -[Aa][Rr][Mm]/ -[Aa][Rr][Mm]64/ -bld/ -[Bb]in/ -[Oo]bj/ -[Ll]og/ -[Ll]ogs/ - -# Visual Studio 2015/2017 cache/options directory -.vs/ -# Uncomment if you have tasks that create the project's static files in wwwroot -#wwwroot/ - -# Visual Studio 2017 auto generated files -Generated\ Files/ - -# MSTest test Results -[Tt]est[Rr]esult*/ -[Bb]uild[Ll]og.* - -# NUnit -*.VisualState.xml -TestResult.xml -nunit-*.xml - -# Build Results of an ATL Project -[Dd]ebugPS/ -[Rr]eleasePS/ -dlldata.c - -# Benchmark Results -BenchmarkDotNet.Artifacts/ - -# .NET Core -project.lock.json -project.fragment.lock.json -artifacts/ - -# ASP.NET Scaffolding -ScaffoldingReadMe.txt - -# StyleCop -StyleCopReport.xml - -# Files built by Visual Studio -*_i.c -*_p.c -*_h.h -*.ilk -*.meta -*.obj -*.iobj -*.pch -*.pdb -*.ipdb -*.pgc -*.pgd -*.rsp -*.sbr -*.tlb -*.tli -*.tlh -*.tmp -*.tmp_proj -*_wpftmp.csproj -*.tlog -*.vspscc -*.vssscc -.builds -*.pidb -*.svclog -*.scc - -# Chutzpah Test files -_Chutzpah* - -# Visual C++ cache files -ipch/ -*.aps -*.ncb -*.opendb -*.opensdf -*.sdf -*.cachefile -*.VC.db -*.VC.VC.opendb - -# Visual Studio profiler -*.psess -*.vsp -*.vspx -*.sap - -# Visual Studio Trace Files -*.e2e - -# TFS 2012 Local Workspace -$tf/ - -# Guidance Automation Toolkit -*.gpState - -# ReSharper is a .NET coding add-in -_ReSharper*/ -*.[Rr]e[Ss]harper -*.DotSettings.user - -# TeamCity is a build add-in -_TeamCity* - -# DotCover is a Code Coverage Tool -*.dotCover - -# AxoCover is a Code Coverage Tool -.axoCover/* 
-!.axoCover/settings.json - -# Coverlet is a free, cross platform Code Coverage Tool -coverage*.json -coverage*.xml -coverage*.info - -# Visual Studio code coverage results -*.coverage -*.coveragexml - -# NCrunch -_NCrunch_* -.*crunch*.local.xml -nCrunchTemp_* - -# MightyMoose -*.mm.* -AutoTest.Net/ - -# Web workbench (sass) -.sass-cache/ - -# Installshield output folder -[Ee]xpress/ - -# DocProject is a documentation generator add-in -DocProject/buildhelp/ -DocProject/Help/*.HxT -DocProject/Help/*.HxC -DocProject/Help/*.hhc -DocProject/Help/*.hhk -DocProject/Help/*.hhp -DocProject/Help/Html2 -DocProject/Help/html - -# Click-Once directory -publish/ - -# Publish Web Output -*.[Pp]ublish.xml -*.azurePubxml -# Note: Comment the next line if you want to checkin your web deploy settings, -# but database connection strings (with potential passwords) will be unencrypted -*.pubxml -*.publishproj - -# Microsoft Azure Web App publish settings. Comment the next line if you want to -# checkin your Azure Web App publish settings, but sensitive information contained -# in these scripts will be unencrypted -PublishScripts/ - -# NuGet Packages -*.nupkg -# NuGet Symbol Packages -*.snupkg -# The packages folder can be ignored because of Package Restore -**/[Pp]ackages/* -# except build/, which is used as an MSBuild target. 
-!**/[Pp]ackages/build/ -# Uncomment if necessary however generally it will be regenerated when needed -#!**/[Pp]ackages/repositories.config -# NuGet v3's project.json files produces more ignorable files -*.nuget.props -*.nuget.targets - -# Microsoft Azure Build Output -csx/ -*.build.csdef - -# Microsoft Azure Emulator -ecf/ -rcf/ - -# Windows Store app package directories and files -AppPackages/ -BundleArtifacts/ -Package.StoreAssociation.xml -_pkginfo.txt -*.appx -*.appxbundle -*.appxupload - -# Visual Studio cache files -# files ending in .cache can be ignored -*.[Cc]ache -# but keep track of directories ending in .cache -!?*.[Cc]ache/ - -# Others -ClientBin/ -~$* -*.dbmdl -*.dbproj.schemaview -*.jfm -*.pfx -*.publishsettings -orleans.codegen.cs - -# Including strong name files can present a security risk -# (https://github.com/github/gitignore/pull/2483#issue-259490424) -#*.snk - -# Since there are multiple workflows, uncomment next line to ignore bower_components -# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) -#bower_components/ - -# RIA/Silverlight projects -Generated_Code/ - -# Backup & report files from converting an old project file -# to a newer Visual Studio version. Backup files are not needed, -# because we have git ;-) -_UpgradeReport_Files/ -Backup*/ -UpgradeLog*.XML -UpgradeLog*.htm -ServiceFabricBackup/ -*.rptproj.bak - -# SQL Server files -*.mdf -*.ldf -*.ndf - -# Business Intelligence projects -*.rdl.data -*.bim.layout -*.bim_*.settings -*.rptproj.rsuser -*- [Bb]ackup.rdl -*- [Bb]ackup ([0-9]).rdl -*- [Bb]ackup ([0-9][0-9]).rdl - -# Microsoft Fakes -FakesAssemblies/ - -# GhostDoc plugin setting file -*.GhostDoc.xml - -# Node.js Tools for Visual Studio -.ntvs_analysis.dat -node_modules/ - -# Visual Studio 6 build log -*.plg - -# Visual Studio 6 workspace options file -*.opt - -# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
-*.vbw - -# Visual Studio 6 auto-generated project file (contains which files were open etc.) -*.vbp - -# Visual Studio 6 workspace and project file (working project files containing files to include in project) -*.dsw -*.dsp - -# Visual Studio 6 technical files - -# Visual Studio LightSwitch build output -**/*.HTMLClient/GeneratedArtifacts -**/*.DesktopClient/GeneratedArtifacts -**/*.DesktopClient/ModelManifest.xml -**/*.Server/GeneratedArtifacts -**/*.Server/ModelManifest.xml -_Pvt_Extensions - -# Paket dependency manager -.paket/paket.exe -paket-files/ - -# FAKE - F# Make -.fake/ - -# CodeRush personal settings -.cr/personal - -# Python Tools for Visual Studio (PTVS) *.pyc - -# Cake - Uncomment if you are using it -# tools/** -# !tools/packages.config - -# Tabs Studio -*.tss - -# Telerik's JustMock configuration file -*.jmconfig - -# BizTalk build output -*.btp.cs -*.btm.cs -*.odx.cs -*.xsd.cs - -# OpenCover UI analysis results -OpenCover/ - -# Azure Stream Analytics local run output -ASALocalRun/ - -# MSBuild Binary and Structured Log -*.binlog - -# NVidia Nsight GPU debugger configuration file -*.nvuser - -# MFractors (Xamarin productivity tool) working folder -.mfractor/ - -# Local History for Visual Studio -.localhistory/ - -# Visual Studio History (VSHistory) files -.vshistory/ - -# BeatPulse healthcheck temp database -healthchecksdb - -# Backup folder for Package Reference Convert tool in Visual Studio 2017 -MigrationBackup/ - -# Ionide (cross platform F# VS Code tools) working folder -.ionide/ - -# Fody - auto-generated XML schema -FodyWeavers.xsd - -# VS Code files for those working on multiple tools -*.code-workspace - -# Local History for Visual Studio Code - -# Windows Installer files from build outputs - -# JetBrains Rider -*.sln.iml - -### VisualStudio Patch ### -# Additional files built by Visual Studio - -# End of 
https://www.toptal.com/developers/gitignore/api/macos,linux,windows,python,jupyternotebooks,jetbrains,pycharm,vim,emacs,visualstudiocode,visualstudio - -scratch/ - -# Test PyPI -.pypirc - -# Ignore .svg output files in test fixtures -tests/fixtures/*.svg \ No newline at end of file +.idea/ +.ruff_cache/ \ No newline at end of file diff --git a/LICENSE b/LICENSE index a2bd5bd..cb22924 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2024 David Meijer +Copyright (c) 2025 David Meijer Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index b3dd513..8e86c0c 100644 --- a/README.md +++ b/README.md @@ -1,181 +1,45 @@

- - Tests + + testing & quality - PyPI + PyPI - PyPI - Python Version - - PyPI - License - - Cookiecutter template from @cthoyt - - Code style: black - - Contributor Covenant - - DOI + PyPI - Python Version + + PyPI - License +

-Versalign is Python package that allows you to create multiple sequence alignments for arbitrary lists of objects. +Versalign is a naive alignment tool for lists of arbitrary objects. Versalign is able to perform pairwise sequence alignments and star-based multiple sequence alignments, based on custom scoring functions. Versalign is primarily designed to align short-ish sequences. -## 💪 Getting Started +Versalign is a Python library and has no command line interface. -Pairwise alignment: +Pairwise alignment, which is the core of this library, is built around Biopython's `PairwiseAligner` class. -```python -from versalign.motif import Motif -from versalign.pairwise import PairwiseAlignment, align_pairwise -from versalign.sequence import Sequence - -class A(Motif): - def __eq__(self, other): - return isinstance(other, A) - - def __str__(self): - return "A" - -class B(Motif): - def __eq__(self, other): - return isinstance(other, B) - - def __str__(self): - return "B" - -def score_func(a, b): - if a == b: - return 1 - return -1 - -seq_a = Sequence("seq_a", [A(), A(), A()]) -seq_b = Sequence("seq_b", [B(), B(), B()]) - -aligned_seq_a, aligned_seq_b, score = align_pairwise( - seq_a=seq_a, - seq_b=seq_b, - score_func=score_func, - algorithm=PairwiseAlignment.NEEDLEMAN_WUNSCH, - options={"gap_penalty": 2, "end_gap_penalty": 1}, -) - -print(aligned_seq_a) -print(aligned_seq_b) - ->> AAA--- ->> ---BBB -``` - -Multiple sequence alignment: - -```python -from versalign.msa import multiple_sequence_alignment - -seq_a = Sequence("seq_a", [A(), A(), A()]) -seq_b = Sequence("seq_b", [B(), B(), B()]) -seq_c = Sequence("seq_c", [A(), B(), B()]) - -result = multiple_sequence_alignment( - seqs=[seq_a, seq_b, seq_c], - gap_penalty=2, - end_gap_penalty=1, - score_func=score_func, -) - -for seq in result: - print(seq) - ->> ---BBB ->> --ABB- ->> AAA--- -``` - -## 🚀 Installation - -The most recent release can be installed from -[PyPI](https://pypi.org/project/versalign/) with: - 
-```shell -pip install versalign -``` +## Installation The most recent code and data can be installed directly from GitHub with: ```shell -pip install git+https://github.com/davidmeijer/versalign.git +pip install git+https://github.com/moltools/versalign.git ``` -## 👐 Contributing - -Contributions, whether filing an issue, making a pull request, or forking, are appreciated. See -[CONTRIBUTING.md](https://github.com/davidmeijer/versalign/blob/main/.github/CONTRIBUTING.md) for more information on getting involved. - -## 👋 Attribution - -### ⚖️ License - -The code in this package is licensed under the MIT License. - -### 🍪 Cookiecutter - -This package was created with [@audreyfeldroy](https://github.com/audreyfeldroy)'s -[cookiecutter](https://github.com/cookiecutter/cookiecutter) package using [@cthoyt](https://github.com/cthoyt)'s -[cookiecutter-snekpack](https://github.com/cthoyt/cookiecutter-snekpack) template. - -## 🛠️ For Developers - -
- See developer instructions - -The final section of the README is for if you want to get involved by making a code contribution. - -### Development Installation - -To install in development mode, use the following: - -```bash -git clone git+https://github.com/davidmeijer/versalign.git -cd versalign -pip install -e . -``` - -### 🥼 Testing - -After cloning the repository and installing `tox` with `pip install tox`, the unit tests in the `tests/` folder can be -run reproducibly with: - -```shell -tox -``` - -Additionally, these tests are automatically re-run with each commit in a -[GitHub Action](https://github.com/davidmeijer/versalign/actions?query=workflow%3ATests). - -### 📦 Making a Release - -After installing the package in development mode and installing -`tox` with `pip install tox`, the commands for making a new release are contained within the `finish` environment -in `tox.ini`. Run the following from the shell: +The latest stable release can be installed from PyPI with: ```shell -tox -e finish +pip install versalign ``` -This script does the following: +Versalign has been developed for Linux and MacOS. -1. Uses [Bump2Version](https://github.com/c4urself/bump2version) to switch the version number in the `setup.cfg`, - `src/versalign/version.py`, and [`docs/source/conf.py`](docs/source/conf.py) to not have the `-dev` suffix -2. Packages the code in both a tar archive and a wheel using [`build`](https://github.com/pypa/build) -3. Uploads to PyPI using [`twine`](https://github.com/pypa/twine). Be sure to have a `.pypirc` file - configured to avoid the need for manual input at this step -4. Push to GitHub. You'll need to make a release going with the commit where the version was bumped. -5. Bump the version to the next patch. If you made big changes and want to bump the version by minor, you can - use `tox -e bumpversion -- minor` after. +## Getting started -
\ No newline at end of file +See the [examples](examples/) folder for some basic usage examples. diff --git a/examples/align_sequences.py b/examples/align_sequences.py new file mode 100644 index 0000000..91265ab --- /dev/null +++ b/examples/align_sequences.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 + +"""Example script that shows how to align two or more sequences using versalign.""" + + +from versalign.aligner import setup_aligner +from versalign.msa import calc_msa +from versalign.pairwise import calc_pairwise_alignment +from versalign.printing import format_alignment +from versalign.scoring import create_substituion_matrix_dynamically + + +def main() -> None: + """Main function to demonstrate sequence alignment.""" + + # --- CASE 1 --- + # Simple equality scoring ACGT + objs = list("ACGT-") + sm, _ = create_substituion_matrix_dynamically(objs) + aligner = setup_aligner(sm, "global") + + # Define example sequences + seq1, seq2, seq3 = list("ACGT"), list("ACGGGGGT"), list("ATCCT") + + # Pairwise alignment + s, aln1, aln2 = calc_pairwise_alignment(aligner, seq1, seq2, gap_repr='-') + print("\nPairwise alignment (equality scoring ACGT):") + print(format_alignment([aln1, aln2], names=["seq1", "seq2"], score=s)) + + # Multiple sequence alignment + seqs = [seq1, seq2, seq3] + lbls = ["seq1", "seq2", "seq3"] + msa, order = calc_msa(aligner, seqs, gap_repr='-') + lbls_ordered = [lbls[i] for i in order] + print("\nMultiple sequence alignment (equality scoring ACGT):") + print(format_alignment(msa, names=lbls_ordered)) + + # --- CASE 2 --- + # DNA: match=+2, transition=-1, transversion=-2, gap=-3 + purines = {"A", "G"} + pyrimidines = {"C", "T"} + + def dna_ti_tv_compare(a: str, b: str) -> int: + if a == b: return 2 + if "-" in (a, b): return -3 # if you include gap in objs + if (a in purines and b in purines) or (a in pyrimidines and b in pyrimidines): + return -1 # transition + return -2 # transversion + + objs = list("ACGT-") + sm, _ = 
create_substituion_matrix_dynamically(objs, compare=dna_ti_tv_compare) + aligner = setup_aligner(sm, "global") + + # Define example sequences + seq1, seq2 = list("ACGTAG"), list("ATCCTAG") + + # Pairwise alignment + s, aln1, aln2 = calc_pairwise_alignment(aligner, seq1, seq2, gap_repr='-') + print("\nPairwise alignment (DNA scoring):") + print(format_alignment([aln1, aln2], names=["seq1", "seq2"], score=s)) + + # --- CASE 3 --- + # Proteins by coarse chemical classes + hydrophobic = set("AILMVFWY") + polar = set("STNQ") + positive = set("KRH") + negative = set("DE") + special = set("CGP") + + def aa_class_compare(a: str, b: str) -> int: + if a == b: return 3 + if "-" in (a, b): return -5 + def cls(x: str) -> str: + if x in hydrophobic: return "hydro" + if x in polar: return "polar" + if x in positive: return "pos" + if x in negative: return "neg" + if x in special: return "special" + return "other" + return 1 if cls(a) == cls(b) else -2 + + objs = list("ACDEFGHIKLMNPQRSTVWY-") + sm, _ = create_substituion_matrix_dynamically(objs, compare=aa_class_compare) + aligner = setup_aligner(sm, "global") + + # Define example sequences + seq1 = list("ACDEFGHIKLMNPQRSTVWY") + seq2 = list("ACDFGHIKLMNQRSTVWA") + + # Pairwise alignment + s, aln1, aln2 = calc_pairwise_alignment(aligner, seq1, seq2, gap_repr='-') + print("\nPairwise alignment (AA class scoring):") + print(format_alignment([aln1, aln2], names=["seq1", "seq2"], score=s)) + + # --- CASE 4 --- + # Compare arbitrary objects (e.g., residues) via a property (mass) + + class Residue: + def __init__(self, name: str, mass: float): + self.name = name + self.mass = mass + + def residue_compare(a: Residue | str, b: Residue | str) -> float: + if "-" in (a, b): return -5.0 + return -abs(a.mass - b.mass) + + def label_fn (r: Residue | str) -> str: + return r if r == "-" else r.name + + residues = [Residue("X", 100.0), Residue("Y", 101.3), Residue("Z", 97.8), "-"] + sm, _ = create_substituion_matrix_dynamically(residues, 
compare=residue_compare, label_fn=label_fn) + aligner = setup_aligner(sm, "global", label_fn=label_fn) + + # Define example sequences + seq1 = [residues[0], residues[1], residues[2], residues[0]] # X Y Z X + seq2 = [residues[1], residues[2], residues[2], residues[0]] # Y Z Z X + seq3 = [residues[0], residues[0]] # X X + seq4 = [residues[2], residues[0], residues[0]] # Z X X + + # Pairwise alignment + s, aln1, aln2 = calc_pairwise_alignment(aligner, seq1, seq2, gap_repr='-') + print("\nPairwise alignment (residue mass scoring):") + print(format_alignment([aln1, aln2], names=["seq1", "seq2"], score=s)) + + # Multiple sequence alignment + seqs = [seq1, seq2, seq3, seq4] + lbls = ["seq1", "seq2", "seq3", "seq4"] + msa, order = calc_msa(aligner, seqs, gap_repr='-') + lbls_ordered = [lbls[i] for i in order] + print("\nMultiple sequence alignment (residue mass scoring):") + print(format_alignment(msa, names=lbls_ordered)) + + # Multiple sequence alignment with seq1 fixed as center star; makes sure that rest is aligned to seq1 + center_star_idx = 0 + msa, order = calc_msa(aligner, seqs, gap_repr='-', center_star=center_star_idx) + lbls_ordered = [lbls[i] for i in order] + print("\nMultiple sequence alignment with fixed center star (residue mass scoring):") + print(format_alignment(msa, names=lbls_ordered)) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 4d39683..3a11703 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,15 +1,57 @@ -# See https://setuptools.readthedocs.io/en/latest/build_meta.html +[project] +name = "versalign" +version = "1.0.0" +requires-python = ">=3.10" + +dependencies = [ + "biopython", + "numpy", + "pandas" +] + [build-system] -requires = ["setuptools", "wheel"] -build-backend = "setuptools.build_meta:__legacy__" - -[tool.black] -line-length = 100 -target-version = ["py310"] - -[tool.isort] -profile = "black" -multi_line_output = 3 -line_length = 100 -include_trailing_comma = true -reverse_relative = 
true \ No newline at end of file +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project.optional-dependencies] +dev = [ + "pytest", + "ruff" +] + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel] +packages = ["src/versalign"] +sources = ["src"] + +[tool.hatch.build] +include = [] + +# ------------------------- +# Hatch env + scripts +# ------------------------- +[tool.hatch.envs.dev] +features = ["dev"] +python = "3.10" + +[tool.hatch.envs.dev.scripts] +lint = "ruff check ." +fmt = "ruff format ." +test = "pytest -q" + +# ------------------------- +# Ruff +# ------------------------- +[tool.ruff] +line-length = 120 +target-version = "py310" +extend-exclude = ["build", "dist", ".venv", "venv", "typings"] + +[tool.ruff.lint] +select = ["E", "F", "I", "UP", "B"] +ignore = [] + +[tool.ruff.lint.per-file-ignores] +"tests/data/**/*" = ["E501"] \ No newline at end of file diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index d5113b4..0000000 --- a/setup.cfg +++ /dev/null @@ -1,173 +0,0 @@ -########################## -# Setup.py Configuration # -########################## -[metadata] -name = versalign -version = 0.0.2-dev -description = Multiple sequence aligner for arbitrary objects. 
-long_description = file: README.md -long_description_content_type = text/markdown - -# URLs associated with the project -url = https://github.com/davidmeijer/versalign -download_url = https://github.com/davidmeijer/versalign/releases -project_urls = - Tracker = https://github.com/davidmeijer/versalign/issues - Source = https://github.com/davidmeijer/versalign - -# Author information -author = David Meijer -author_email = david.meijer@wur.nl -maintainer = David Meijer -maintainer_email = david.meijer@wur.nl - -# License Information -license = MIT -license_files = - LICENSE - -# Search tags -classifiers = - Development Status :: 1 - Planning - Environment :: Console - Intended Audience :: Developers - License :: OSI Approved :: MIT License - Operating System :: OS Independent - Framework :: Pytest - Framework :: tox - Framework :: Sphinx - Programming Language :: Python - Programming Language :: Python :: 3.10 - Programming Language :: Python :: 3.11 - Programming Language :: Python :: 3.12 - Programming Language :: Python :: 3 :: Only -keywords = - snekpack - cookiecutter - pairwise alignment - multiple sequence alignment - -[options] -install_requires = - # Missing itertools from the standard library you didn't know you needed - more_itertools - # Use progress bars excessively - tqdm - # Command line tools - click - more_click - # Other - numpy - scipy - -# Random options -zip_safe = false -include_package_data = True -python_requires = >=3.10 - -# Where is my code -packages = find: -package_dir = - = src - -[options.packages.find] -where = src - -[options.extras_require] -tests = - pytest - coverage -docs = - sphinx - sphinx-rtd-theme - sphinx-click - sphinx_automodapi - autodoc_pydantic - # To include LaTeX comments easily in your docs - texext - -###################### -# Doc8 Configuration # -# (doc8.ini) # -###################### -[doc8] -max-line-length = 120 - -########################## -# Coverage Configuration # -# (.coveragerc) # 
-########################## -[coverage:run] -branch = True -source = versalign -omit = - tests/* - docs/* - -[coverage:paths] -source = - src/versalign - .tox/*/lib/python*/site-packages/versalign - -[coverage:report] -show_missing = True -exclude_lines = - pragma: no cover - raise NotImplementedError - if __name__ == "__main__": - if TYPE_CHECKING: - def __str__ - def __repr__ - -########################## -# Darglint Configuration # -########################## -[darglint] -docstring_style = sphinx -strictness = short - -####################### -# MyPy Configuration # -####################### -[mypy] -plugins = pydantic.mypy - -######################### -# Flake8 Configuration # -# (.flake8) # -######################### -[flake8] -ignore = - # pickle - S301 - # pickle - S403 - S404 - S603 - # Line break before binary operator (conflicts with black) - W503 - # Multiple statements on one line (conflicts with black) - E704 - # whitespace before ':' (conflicts with black) - E203 - # Requests call without timeout - S113 -exclude = - .tox, - .git, - __pycache__, - docs/source/conf.py, - build, - dist, - tests/fixtures/*, - *.pyc, - *.egg-info, - .cache, - .eggs, - data -max-line-length = 120 -max-complexity = 20 -import-order-style = pycharm -application-import-names = - versalign - tests \ No newline at end of file diff --git a/src/versalign/__init__.py b/src/versalign/__init__.py deleted file mode 100644 index 7355154..0000000 --- a/src/versalign/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Versalign is Python package that allows you to create multiple sequence alignments for arbitrary lists of objects.""" - -from .version import get_version - -__all__ = [ - "get_version", -] diff --git a/src/versalign/aligner.py b/src/versalign/aligner.py new file mode 100644 index 0000000..76313e6 --- /dev/null +++ b/src/versalign/aligner.py @@ -0,0 +1,81 @@ +"""Aligner module for pairwise sequence alignment.""" + +from collections.abc import Callable, 
Hashable +from dataclasses import dataclass + +from Bio.Align import PairwiseAligner, substitution_matrices + +from versalign.scoring import T + +# Define __all__ for explicit export of PairwiseAligner class +__all__ = ["PairwiseAligner", "substitution_matrices"] + + +@dataclass +class Aligner: + """Dataclass to hold a PairwiseAligner and its label function.""" + + aligner: PairwiseAligner + label_fn: Callable[[T], Hashable] | None = None + + +def setup_aligner( + substitution_matrix: substitution_matrices.Array, + mode: str = "global", + target_internal_open_gap_score: float = -1.0, + target_internal_extend_gap_score: float = -1.0, + target_left_open_gap_score: float = -1.0, + target_left_extend_gap_score: float = -1.0, + target_right_open_gap_score: float = -1.0, + target_right_extend_gap_score: float = -1.0, + query_internal_open_gap_score: float = -1.0, + query_internal_extend_gap_score: float = -1.0, + query_left_open_gap_score: float = -1.0, + query_left_extend_gap_score: float = -1.0, + query_right_open_gap_score: float = -1.0, + query_right_extend_gap_score: float = -1.0, + label_fn: Callable[[T], Hashable] | None = None, +) -> Aligner: + """ + Setup a PairwiseAligner with the given parameters. 
+ + :param substitution_matrix: substitution matrix to be used for alignment + :param mode: alignment mode ("local" or "global"), defaults to "global" + :param target_internal_open_gap_score: internal open gap score for target sequence (default: -1.0) + :param target_internal_extend_gap_score: internal extend gap score for target sequence (default: -1.0) + :param target_left_open_gap_score: left open gap score for target sequence (default: -1.0) + :param target_left_extend_gap_score: left extend gap score for target sequence (default: -1.0) + :param target_right_open_gap_score: right open gap score for target sequence (default: -1.0) + :param target_right_extend_gap_score: right extend gap score for target sequence (default: -1.0) + :param query_internal_open_gap_score: internal open gap score for query sequence (default: -1.0) + :param query_internal_extend_gap_score: internal extend gap score for query sequence (default: -1.0) + :param query_left_open_gap_score: left open gap score for query sequence (default: -1.0) + :param query_left_extend_gap_score: left extend gap score for query sequence (default: -1.0) + :param query_right_open_gap_score: right open gap score for query sequence (default: -1.0) + :param query_right_extend_gap_score: right extend gap score for query sequence (default: -1.0) + :param label_fn: optional function to label item in sequences (default: None) + :return: PairwiseAligner object + :raises ValueError: if mode is not "global" or "local" + :raises ValueError: if substitution matrix is not provided + """ + if mode not in ["global", "local"]: + raise ValueError("mode must be one of 'global' or 'local'") + + aligner = PairwiseAligner() + aligner.mode = mode + aligner.target_internal_open_gap_score = target_internal_open_gap_score + aligner.target_internal_extend_gap_score = target_internal_extend_gap_score + aligner.target_left_open_gap_score = target_left_open_gap_score + aligner.target_left_extend_gap_score = target_left_extend_gap_score 
+ aligner.target_right_open_gap_score = target_right_open_gap_score + aligner.target_right_extend_gap_score = target_right_extend_gap_score + aligner.query_internal_open_gap_score = query_internal_open_gap_score + aligner.query_internal_extend_gap_score = query_internal_extend_gap_score + aligner.query_left_open_gap_score = query_left_open_gap_score + aligner.query_left_extend_gap_score = query_left_extend_gap_score + aligner.query_right_open_gap_score = query_right_open_gap_score + aligner.query_right_extend_gap_score = query_right_extend_gap_score + aligner.wildcard = None + aligner.substitution_matrix = substitution_matrix + + return Aligner(aligner=aligner, label_fn=label_fn) diff --git a/src/versalign/config.py b/src/versalign/config.py new file mode 100644 index 0000000..cdbd251 --- /dev/null +++ b/src/versalign/config.py @@ -0,0 +1,16 @@ +"""Configuration settings for versalign.""" + +import logging +import os + +# Global logger name for versalign +LOGGER_NAME = "versalign" +LOGGER_LEVEL = int(os.getenv("LOGGER_LEVEL", logging.INFO)) + + +# Configure global logger +logger = logging.getLogger(LOGGER_NAME) +logger.setLevel(LOGGER_LEVEL) + + +DEFAULT_GAP_REPR = "-" diff --git a/src/versalign/helpers.py b/src/versalign/helpers.py new file mode 100644 index 0000000..9cc8313 --- /dev/null +++ b/src/versalign/helpers.py @@ -0,0 +1,47 @@ +"""Helpers for sequence alignment.""" + +from collections.abc import Callable, Hashable + +import numpy as np +from numpy.typing import NDArray + +from versalign.scoring import T + + +def seq_to_arr( + seq: list[str], + alphabet: list[str], + label_fn: Callable[[T], Hashable] | None = None, +) -> NDArray[np.int32]: + """ + Convert a sequence of strings to an array of integers based on the alphabet. 
+ + :param seq: sequence of strings + :param alphabet: list of strings representing the alphabet + :param label_fn: optional function to map sequence items to labels + :return: array of integers + :raises ValueError: if a item in the sequence is not in the alphabet + """ + arr = [] + for item in seq: + try: + if not label_fn: + arr.append(alphabet.index(item)) + else: + labeled_item = label_fn(item) + arr.append(alphabet.index(labeled_item)) + except ValueError as e: + raise ValueError(f"Item '{item}' not found in alphabet") from e + return np.array(arr, dtype=np.int32) + + +def arr_to_seq(arr: NDArray[np.int32], alphabet: list[str]) -> list[str]: + """ + Convert an array of integers to a sequence of strings based on the alphabet. + + :param arr: array of integers + :param alphabet: list of strings representing the alphabet + :return: sequence of strings + :raises ValueError: if an integer in the array is not in the alphabet + """ + return [alphabet[i] for i in arr.tolist() if i < len(alphabet)] diff --git a/src/versalign/matrix.py b/src/versalign/matrix.py deleted file mode 100644 index 0f4d10b..0000000 --- a/src/versalign/matrix.py +++ /dev/null @@ -1,177 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Implementation of the scoring matrix for sequence alignment.""" - -import logging -import typing as ty - -import numpy as np - - -class Matrix: - """Base class for matrix objects.""" - - def __init__(self, nrows: int, ncols: int, fill: float) -> None: - """Initialize the matrix object. - - :param nrows: The number of rows in the matrix. - :type nrows: int - :param ncols: The number of columns in the matrix. - :type ncols: int - :param fill: The value to fill the matrix with. - :type fill: float - """ - self._nrows = nrows - self._ncols = ncols - self._matrix = self._create_matrix(fill) - - @property - def nrows(self) -> int: - """Return the number of rows in the matrix. - - :return: The number of rows in the matrix. 
- :rtype: int - """ - return self._nrows - - @property - def ncols(self) -> int: - """Return the number of columns in the matrix. - - :return: The number of columns in the matrix. - :rtype: int - """ - return self._ncols - - @property - def max_value(self) -> float: - """Return the maximum value in the matrix. - - :return: The maximum value in the matrix. - :rtype: float - """ - return np.max(self._matrix) - - @property - def min_value(self) -> float: - """Return the minimum value in the matrix. - - :return: The minimum value in the matrix. - :rtype: float - """ - return np.min(self._matrix) - - def _create_matrix(self, fill: float) -> np.ndarray: - """Create the matrix object with NumPy. - - :param fill: The value to fill the matrix with. - :type fill: float - :return: The matrix object. - :rtype: np.ndarray - :raises ValueError: If the fill value is not an integer or float. - """ - logger = logging.getLogger(__name__) - - if isinstance(fill, float): - return np.full((self._nrows, self._ncols), fill, dtype=np.float32) - - else: - msg = "Fill value must be a float." - logger.error(msg) - raise ValueError(msg) - - def transpose(self) -> np.ndarray: - """Transpose the matrix. - - :return: The transposed matrix. - :rtype: np.ndarray - """ - return self._matrix.T - - def set_value(self, row: int, col: int, value: ty.Union[int, float]) -> None: - """Set the value of a cell in the matrix. - - :param row: The row index. - :type row: int - :param col: The column index. - :type col: int - :param value: The value to set. - :type value: Union[int, float] - :raises ValueError: If the row or column index is out of bounds. - :raises ValueError: If the value is not an integer. - """ - logger = logging.getLogger(__name__) - - if not (0 <= row < self._nrows): - msg = "Row index out of bounds." - logger.error(msg) - raise ValueError(msg) - - if not (0 <= col < self._ncols): - msg = "Column index out of bounds." 
- logger.error(msg) - raise ValueError(msg) - - if ( - not isinstance(value, int) - and not isinstance(value, float) - and not isinstance(value, np.float32) - ): - msg = "Value must be an integer or float." - logger.error(msg) - raise ValueError(msg) - - if isinstance(value, int): - value = float(value) - - self._matrix[row, col] = value - - def get_value(self, row: int, col: int) -> float: - """Get the value of a cell in the matrix. - - :param row: The row index. - :type row: int - :param col: The column index. - :type col: int - :return: The value of the cell. - :rtype: float - :raises ValueError: If the row or column index is out of bounds. - """ - logger = logging.getLogger(__name__) - - if not (0 <= row < self._nrows): - msg = "Row index out of bounds." - logger.error(msg) - raise ValueError(msg) - - if not (0 <= col < self._ncols): - msg = "Column index out of bounds." - logger.error(msg) - raise ValueError(msg) - - return self._matrix[row, col] - - def normalize(self) -> None: - """Normalize the matrix so that all values are between 0 and 1.""" - self._matrix = (self._matrix - self.min_value) / (self.max_value - self.min_value) - - def to_distances(self) -> np.ndarray: - """Convert the matrix to a distance matrix. - - :return: The distance matrix. - :rtype: np.ndarray - """ - self.normalize() - return 1 - self._matrix - - -class AlignmentMatrix(Matrix): - """Class for alignment matrix objects.""" - - def alignment_score(self) -> float: - """Return the alignment score of the matrix. - - :return: The alignment score. 
- :rtype: float - """ - return self._matrix[-1, -1] diff --git a/src/versalign/motif.py b/src/versalign/motif.py deleted file mode 100644 index 9c4a925..0000000 --- a/src/versalign/motif.py +++ /dev/null @@ -1,84 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Implementation of a sequence motif object.""" - -import typing as ty -from abc import ABC, abstractmethod - - -class Motif(ABC): - """Base class for sequence motif objects.""" - - def __init__(self, tag: ty.Optional[int] = None) -> None: - """Initialize the motif object. - - :param tag: The tag for the motif. - :type tag: ty.Optional[int] - """ - self._tag = tag - - def get_tag(self) -> ty.Optional[int]: - """Return the tag for the motif. - - :return: The tag for the motif. - :rtype: ty.Optional[int] - """ - return self._tag - - def set_tag(self, tag: int) -> None: - """Set the tag for the motif. - - :param tag: The tag for the motif. - :type tag: int - """ - self._tag = tag - - def clear_tag(self) -> None: - """Clear the tag for the motif.""" - self._tag = None - - @abstractmethod - def __eq__(self, other: ty.Any) -> bool: - """Compare the motif to another motif. - - :param other: The other motif to compare. - :type other: ty.Any - :return: True if the motifs are equal, False otherwise. - :rtype: bool - """ - pass - - @abstractmethod - def __str__(self) -> str: - """Return the string representation of the motif. - - :return: The string representation of the motif. - :rtype: str - """ - pass - - -class Gap(Motif): - """Class for representing a gap in a sequence motif.""" - - def __init__(self) -> None: - """Initialize the gap object.""" - super().__init__() # Gap is initialized with tag=None. - - def __eq__(self, other: ty.Any) -> bool: - """Compare the gap to another motif. - - :param other: The other motif to compare. - :type other: Motif - :return: True if the motifs are equal, False otherwise. 
- :rtype: bool - """ - return isinstance(other, Gap) - - def __str__(self) -> str: - """Return the string representation of the gap. - - :return: The string representation of the gap. - :rtype: str - """ - return "-" diff --git a/src/versalign/msa.py b/src/versalign/msa.py index f42e69a..74f6cd8 100644 --- a/src/versalign/msa.py +++ b/src/versalign/msa.py @@ -1,369 +1,112 @@ -# -*- coding: utf-8 -*- - -"""Implementation of multiple sequence alignment.""" +"""Multiple Sequence Alignment (MSA) module.""" import logging -import typing as ty - -from scipy.cluster.hierarchy import linkage -from scipy.spatial.distance import pdist +from collections.abc import Callable, Hashable -from .matrix import Matrix -from .motif import Gap, Motif -from .pairwise import PairwiseAlignment, align_pairwise, create_alignment_matrix_needelman_wunsch -from .sequence import Sequence +import numpy as np +from versalign.aligner import Aligner, PairwiseAligner +from versalign.config import DEFAULT_GAP_REPR, LOGGER_NAME +from versalign.helpers import arr_to_seq, seq_to_arr +from versalign.pairwise import pairwise_alignment, pairwise_alignment_score +from versalign.scoring import T -def pairwise_scoring_matrix( - seqs: ty.List[Sequence], - gap_penalty: int, - end_gap_penalty: int, - score_func: ty.Callable[[Motif, Motif], int], -) -> Matrix: - """Compute the pairwise similarity matrix of the given sequences. - :param seqs: List of sequences to compute the similarity matrix. - :type seqs: List[Sequence] - :param gap_penalty: Penalty for opening a gap. - :type gap_penalty: int - :param end_gap_penalty: Penalty for extending a gap. - :type end_gap_penalty: int - :param score_func: Function to score two motifs. - :type score_func: Callable[[Motif, Motif], int] - :return: Pairwise similarity matrix. - :rtype: Matrix - - The similarity matrix is computed using the global Needleman-Wunsch - algorithm. The similarity matrix is symmetric, so only the upper triangle is - computed. 
+def calc_msa( + aligner: Aligner, + seqs: list[list[str]], + gap_repr: str = DEFAULT_GAP_REPR, + center_star: int | None = None, +) -> tuple[list[list[str]], list[int]]: """ - num_seqs = len(seqs) - matrix = Matrix(num_seqs, num_seqs, fill=0.0) - - for i, seq_a in enumerate(seqs): - for j, seq_b in enumerate(seqs): - - # Skip if the similarity has already been computed. - if j < i: - continue - - score = create_alignment_matrix_needelman_wunsch( - seq_a=seq_a, - seq_b=seq_b, - gap_penalty=gap_penalty, - end_gap_penalty=end_gap_penalty, - score_func=score_func, - ).alignment_score() - - matrix.set_value(i, j, score) - matrix.set_value(j, i, score) - - return matrix - - -def merge_singles( - single1: Sequence, - single2: Sequence, - gap_penalty: int, - end_gap_penalty: int, - score_func: ty.Callable[[Motif, Motif], int], -) -> ty.List[Sequence]: - """Merge two single sequences into a single alignment. - - :param single1: First sequence to merge. - :type single1: Sequence - :param single2: Second sequence to merge. - :type single2: Sequence - :param gap_penalty: Penalty for opening a gap. - :type gap_penalty: int - :param end_gap_penalty: Penalty for extending a gap. - :type end_gap_penalty: int - :param score_func: Function to score two motifs. - :type score_func: Callable[[Motif, Motif], int] - :return: List with two aligned sequences. - :rtype: ty.List[Sequence] + Perform multiple sequence alignment using center star method. + + :param aligner: Aligner object + :param seqs: list of sequences to be aligned + :param gap_repr: gap representation, defaults to DEFAULT_GAP_REPR. 
Make sure + gap repr is in alphabet of substitution matrix + :param center_star: index of the center star sequence, defaults to None + :return: multiple sequence alignment and order of input sequences in alignment + :raises ValueError: if sequence list is empty """ - single1_aligned, single2_aligned, _ = align_pairwise( - seq_a=single1, - seq_b=single2, - score_func=score_func, - algorithm=PairwiseAlignment.NEEDLEMAN_WUNSCH, - options={"gap_penalty": gap_penalty, "end_gap_penalty": end_gap_penalty}, - ) - - # Clear all tags. - single1_aligned.clear_tags() - single2_aligned.clear_tags() - - return [single1_aligned, single2_aligned] - - -def merge_single_with_cluster( - single: Sequence, - cluster: ty.List[Sequence], - gap_penalty: int, - end_gap_penalty: int, - score_func: ty.Callable[[Motif, Motif], int], -) -> ty.List[Sequence]: - """Merge a single sequence with a cluster of sequences. - - :param single: Single sequence to merge. - :type single: Sequence - :param cluster: Cluster of sequences to merge with. - :type cluster: List[Sequence] - :param gap_penalty: Penalty for opening a gap. - :type gap_penalty: int - :param end_gap_penalty: Penalty for extending a gap. - :type end_gap_penalty: int - :param score_func: Function to score two motifs. - :type score_func: Callable[[Motif, Motif], int] - :return: Merged sequence. - :rtype: ty.List[Sequence] - """ - # Tag already aligned sequences with original location for insertion - # of possible gaps after annealing the new sequence. - first = cluster[0] - first.tag() - last = cluster[-1] - last.tag() - - # Align the leaf with the first and the last sequence in the cluster. 
- single_aligned_with_first, first_aligned, score_with_first = align_pairwise( - seq_a=single, - seq_b=first, - score_func=score_func, - algorithm=PairwiseAlignment.NEEDLEMAN_WUNSCH, - options={"gap_penalty": gap_penalty, "end_gap_penalty": end_gap_penalty}, - ) - single_aligned_with_last, last_aligned, score_with_last = align_pairwise( - seq_a=single, - seq_b=last, - score_func=score_func, - algorithm=PairwiseAlignment.NEEDLEMAN_WUNSCH, - options={"gap_penalty": gap_penalty, "end_gap_penalty": end_gap_penalty}, - ) - - if score_with_first >= score_with_last: - new, anchor = single_aligned_with_first, first_aligned - others = cluster[1:] - else: - new, anchor = single_aligned_with_last, last_aligned - others = cluster[:-1] - - for motif_idx, motif in enumerate(anchor): - if motif.get_tag() is None: # New insertion if tag is None. - for seq in others: - seq.insert(motif_idx, Gap()) - - # Clear all tags. - for seq in others: - seq.clear_tags() - anchor.clear_tags() - new.clear_tags() - - # Insert the new sequence into the cluster. - if score_with_first >= score_with_last: - new_cluster = [new, anchor] + others - else: - new_cluster = others + [anchor, new] - - return new_cluster - - -def merge_clusters( - cluster1: ty.List[Sequence], - cluster2: ty.List[Sequence], - gap_penalty: int, - end_gap_penalty: int, - score_func: ty.Callable[[Motif, Motif], int], -) -> ty.List[Sequence]: - """Merge two clusters of sequences. - - :param cluster1: First cluster of sequences to merge. - :type cluster1: List[Sequence] - :param cluster2: Second cluster of sequences to merge. - :type cluster2: List[Sequence] - :param gap_penalty: Penalty for opening a gap. - :type gap_penalty: int - :param end_gap_penalty: Penalty for extending a gap. - :type end_gap_penalty: int - :param score_func: Function to score two motifs. - :type score_func: Callable[[Motif, Motif], int] - :return: Merged sequence. 
- :rtype: ty.List[Sequence] - """ - # Tag already aligned sequences with original location for insertion - # of possible gaps after annealing the new sequence. - msa1_top, msa1_bottom = cluster1[0], cluster1[-1] - msa2_top, msa2_bottom = cluster2[0], cluster2[-1] - msa1_top.tag() - msa1_bottom.tag() - msa2_top.tag() - msa2_bottom.tag() - - # Align the top of msa1 with the bottom of msa2 and vice versa. - msa1_bottom_aligned_with_msa2_top, msa2_top_aligned_with_msa1_bottom, msa1_top_score = ( - align_pairwise( - seq_a=msa1_bottom, - seq_b=msa2_top, - score_func=score_func, - algorithm=PairwiseAlignment.NEEDLEMAN_WUNSCH, - options={"gap_penalty": gap_penalty, "end_gap_penalty": end_gap_penalty}, - ) - ) - msa1_top_aligned_to_msa2_bottom, msa2_bottom_aligned_to_msa1_top, msa2_top_score = ( - align_pairwise( - seq_a=msa1_top, - seq_b=msa2_bottom, - score_func=score_func, - algorithm=PairwiseAlignment.NEEDLEMAN_WUNSCH, - options={"gap_penalty": gap_penalty, "end_gap_penalty": end_gap_penalty}, - ) - ) - - if msa1_top_score >= msa2_top_score: - msa1_align_to, msa2_align_to = ( - msa1_bottom_aligned_with_msa2_top, - msa2_top_aligned_with_msa1_bottom, - ) - msa1_others = cluster1[:-1] - msa2_others = cluster2[1:] - else: - msa2_align_to, msa1_align_to = ( - msa1_top_aligned_to_msa2_bottom, - msa2_bottom_aligned_to_msa1_top, - ) - msa1_others = cluster1[1:] - msa2_others = cluster2[:-1] - - for motif_idx, motif in enumerate(msa1_align_to): - if motif.get_tag() is None: # New insertion if tag is None. - for seq in msa1_others: - seq.insert(motif_idx, Gap()) - - for motif_idx, motif in enumerate(msa2_align_to): - if motif.get_tag() is None: # New insertion if tag is None. - for seq in msa2_others: - seq.insert(motif_idx, Gap()) - - # Clear all tags. - for seq in msa1_others: - seq.clear_tags() - for seq in msa2_others: - seq.clear_tags() - msa1_align_to.clear_tags() - msa2_align_to.clear_tags() - - # Insert the new sequence into the cluster. 
- if msa1_top_score >= msa2_top_score: - new_cluster = msa1_others + [msa1_align_to, msa2_align_to] + msa2_others - else: - new_cluster = msa2_others + [msa2_align_to, msa1_align_to] + msa1_others - - return new_cluster - - -def multiple_sequence_alignment( - seqs: ty.List[Sequence], - gap_penalty: int, - end_gap_penalty: int, - score_func: ty.Callable[[Motif, Motif], int], -) -> ty.List[Sequence]: - """Perform multiple sequence alignment on the given sequences. - - :param seqs: List of sequences to align. - :type seqs: List[Sequence] - :param gap_penalty: Penalty for opening a gap. - :type gap_penalty: int - :param end_gap_penalty: Penalty for extending a gap. - :type end_gap_penalty: int - :param score_func: Function to score two motifs. - :type score_func: Callable[[Motif, Motif], int] - :return: List of aligned sequences. - :rtype: List[Sequence] - :raises ValueError: If the multiple sequence alignment is incomplete. - - Based on: 'Progressive Sequence Alignment as a Prerequisite to Correct - Phylogenetic Trees' by Feng and Doolittle, 1987 - """ - logger = logging.getLogger(__name__) + logger = logging.getLogger(LOGGER_NAME) if not seqs: - logger.error("No sequences to align.") - return [] - elif len(seqs) == 1: - logger.error("Only one sequence to align.") - return seqs + raise ValueError("the sequence list cannot be empty") + + aligner_obj: PairwiseAligner = aligner.aligner + label_fn: Callable[[T], Hashable] | None = aligner.label_fn + + # Set aligner mode to global + current_mode = aligner_obj.mode + if current_mode != "global": + logger.info(f"Overriding aligner mode from {current_mode} to 'global' for MSA") + aligner_obj.mode = "global" + + # Get int repr of gap + int_gap_repr = aligner_obj.substitution_matrix.names.index(gap_repr) + + # Convert sequences into int arrays based on substitution matrix alphabet + int_seqs = [] + for seq in seqs: + int_seqs.append(seq_to_arr(seq, aligner_obj.substitution_matrix.names, label_fn)) + + # Create pairwise 
similarity matrix + sims = np.zeros((len(int_seqs), len(int_seqs)), dtype=float) + for i, int_seq1 in enumerate(int_seqs): + for j, int_seq2 in enumerate(int_seqs): + # We don't register a score for i == j, because self-alignment shouldn't + # be considered for determining center star + if i >= j: + continue # only need to calcualte lower triangle + + score = pairwise_alignment_score(aligner_obj, int_seq1, int_seq2) + sims[i, j] = score + sims[j, i] = score + + # Find center star sequence, i.e., the sequence with the highest absolute similarity + if center_star is None: + center_i = int(np.argmax(np.sum(sims, axis=1))) else: - logger.debug("Computing pairwise similarity matrix...") - scores = pairwise_scoring_matrix(seqs, gap_penalty, end_gap_penalty, score_func) - scores.to_distances() - logger.debug("Computed pairwise similarity matrix.") - - # Identification of most closely related pairs. - guide_tree = linkage(pdist(scores._matrix), method="ward") - msa = {seq_idx: [seq] for seq_idx, seq in enumerate(seqs)} - - # Progressive insertion of neutral elements (can create new gaps, but cannot - # remove existing gaps). - for pair_idx, pair in enumerate(guide_tree): - - # Every pair in the guide tree can be a new pair that needs seed alignment - # or it is a leaf connecting to to an existing alignment. - j1, j2 = int(pair[0]), int(pair[1]) - new_idx = pair_idx + len(seqs) - - if len(msa[j1]) == 1 and len(msa[j2]) == 1: - seed1, seed2 = msa[j1][0], msa[j2][0] - seq1, seq2 = merge_singles(seed1, seed2, gap_penalty, end_gap_penalty, score_func) - msa[new_idx] = [seq1, seq2] - - elif len(msa[j1]) == 1 or len(msa[j2]) == 1: - - # One of the sequences is a leaf, the other is an aligned cluster. 
- if len(msa[j1]) == 1: - leaf, cluster = msa[j1][0], msa[j2] - else: - leaf, cluster = msa[j2][0], msa[j1] - - new_cluster = merge_single_with_cluster( - single=leaf, - cluster=cluster, - gap_penalty=gap_penalty, - end_gap_penalty=end_gap_penalty, - score_func=score_func, - ) - - # Update the MSA. - msa[new_idx] = new_cluster - - # Both items are already aligned cluste: len(msa[j1]) > 1 and len(msa[j2]) > 1 + center_i = center_star + + # Mask out self‑score so we can sort without special slicing + sims[center_i, center_i] = -np.inf + + # Sort all by descending similarity, drop the center itself + all_idx = sims[center_i].argsort()[::-1] + others = [i for i in all_idx if i != center_i] + + # Align sequences to center star sequence + msa = np.array([int_seqs[center_i]], dtype=np.int32) + for item in others: + # Align sequence to center star sequence + target = msa[0] + query = int_seqs[item] + insert_repr = np.int32(-1) + _, t_a, q_a = pairwise_alignment(aligner_obj, target, query, gap_repr=insert_repr) + + # Insert gaps in remainder sequences, if any + remainder = msa[1:] + if remainder.shape[0] > 0: + insert_indices = [i for i, x in enumerate(t_a) if x == insert_repr] + for i in insert_indices: + remainder = np.insert(remainder, i, insert_repr, axis=1) + msa = np.vstack([t_a, remainder, q_a]) else: - # First we need to decide which MSA comes on top. To determine this - # we need to score the top of msa1 with the bottom of msa2 and vice - # versa. - cluster1, cluster2 = msa[j1], msa[j2] + msa = np.vstack([t_a, q_a]) - new_cluster = merge_clusters( - cluster1=cluster1, - cluster2=cluster2, - gap_penalty=gap_penalty, - end_gap_penalty=end_gap_penalty, - score_func=score_func, - ) + # Rename insert_repr to gap_repr + msa[msa == insert_repr] = int_gap_repr - # Update the MSA. 
- msa[new_idx] = new_cluster + # Check if there are any gap-only columns, delete those if any + gap_columns = np.where(np.all(msa == int_gap_repr, axis=0))[0] + if gap_columns.size > 0: + msa = np.delete(msa, gap_columns, axis=1) - del msa[j1] - del msa[j2] + # Convert msa to list of list of strings + str_msa = [arr_to_seq(row, aligner_obj.substitution_matrix.names) for row in msa] - # Retrieve final MSA. - clusters = list(msa.keys()) + final_order = [center_i] + others - if len(clusters) == 1: - return msa[clusters[0]] - else: - msg = f"MSA incomplete. Found {len(clusters)} unaligned clusters." - logger.error(msg) - raise ValueError(msg) + return str_msa, final_order diff --git a/src/versalign/pairwise.py b/src/versalign/pairwise.py index 1ac5a59..1fb5f0b 100644 --- a/src/versalign/pairwise.py +++ b/src/versalign/pairwise.py @@ -1,515 +1,115 @@ -# -*- coding: utf-8 -*- +"""Pairwise sequence alignment module.""" -"""Implementation of pairwise alignment between two sequences.""" - -import logging -import typing as ty -from enum import Enum, IntEnum, auto +from collections.abc import Callable, Hashable import numpy as np +from numpy.typing import NDArray -from .matrix import AlignmentMatrix -from .motif import Gap, Motif -from .sequence import Sequence - - -class PairwiseAlignment(Enum): - """Enum for the type of pairwise alignment.""" - - NEEDLEMAN_WUNSCH = auto() # Needleman-Wunsch for global alignment. - SMITH_WATERMAN = auto() # Smith-Waterman for local alignment. - +from versalign.aligner import Aligner, PairwiseAligner +from versalign.config import DEFAULT_GAP_REPR +from versalign.helpers import arr_to_seq, seq_to_arr +from versalign.scoring import T -def create_alignment_matrix_needelman_wunsch( - seq_a: Sequence, - seq_b: Sequence, - gap_penalty: int, - end_gap_penalty: int, - score_func: ty.Callable[[Motif, Motif], int], -) -> AlignmentMatrix: - """Create the alignment matrix for the two sequences. - :param seq_a: The first sequence to align. 
- :type seq_a: Sequence - :param seq_b: The second sequence to align. - :type seq_b: Sequence - :param gap_penalty: The penalty for inserting a gap. - :type gap_penalty: int - :param end_gap_penalty: The penalty for inserting a gap at the end of a sequence. - :type end_gap_penalty: int - :param score_func: The function to score a pair of motifs. - :type score_func: ty.Callable[[Motif, Motif], int] - :return: The alignment matrix for the two sequences. - :rtype: AlignmentMatrix - :raises ValueError: If the sequences are not Sequence objects. - :raises ValueError: If the gap penalties are not integers. - :raises ValueError: If the score function is not callable. +def pairwise_alignment_score( + aligner: PairwiseAligner, + target: NDArray[np.int32], + query: NDArray[np.int32], +) -> float: """ - logger = logging.getLogger(__name__) - logger.debug("Creating alignment matrix...") - - # Check if seq_a is Sequence and seq_b is Sequence. - if not isinstance(seq_a, Sequence) or not isinstance(seq_b, Sequence): - msg = "seq_a and seq_b must be Sequence objects." - logger.error(msg) - raise ValueError(msg) - - # Check if gap_penalty is int and end_gap_penalty is int. - if not isinstance(gap_penalty, int) or not isinstance(end_gap_penalty, int): - msg = "gap_penalty and end_gap_penalty must be integers." - logger.error(msg) - raise ValueError(msg) - - # Check if score_func is callable. - if not callable(score_func): - msg = "score_func must be callable." - logger.error(msg) - raise ValueError(msg) - - # Instatiate the alignment matrix. - nrows = len(seq_a) + 1 - ncols = len(seq_b) + 1 - matrix = AlignmentMatrix(nrows, ncols, fill=0.0) - - # Fill in initial alignment for when there is zero alignment. - for ri in range(nrows): - matrix.set_value(ri, 0, ri * end_gap_penalty) + Align two sequences and return the alignment score. - for ci in range(ncols): - matrix.set_value(0, ci, ci * end_gap_penalty) - - # Fill in the rest of the alignment matrix. 
- for ci in range(1, ncols): - for ri in range(1, nrows): - - # Calculate the pairwise score. - score = score_func(seq_a[ri - 1], seq_b[ci - 1]) - - # Calculate the penalty for inserting a gap. - if ci == len(seq_b) or ri == len(seq_a): - penalty = end_gap_penalty - else: - penalty = gap_penalty - - # Calculate the alignment score. - matrix.set_value( - row=ri, - col=ci, - value=max( - matrix.get_value(ri - 1, ci) - penalty, # Insert gap in seq_a. - matrix.get_value(ri, ci - 1) - penalty, # Insert gap in seq_b. - matrix.get_value(ri - 1, ci - 1) + score, # Match or mismatch. - ), - ) - - logger.debug("Alignment matrix created.") - return matrix - - -def traceback_alignment_needelman_wunsch( - matrix: AlignmentMatrix, - seq_a: Sequence, - seq_b: Sequence, - gap_penalty: int, - end_gap_penalty: int, -) -> ty.List[ty.Tuple[Motif, Motif]]: - """Traceback the alignment matrix to find the optimal alignment. - - :param matrix: The alignment matrix. - :type matrix: AlignmentMatrix - :param seq_a: The first sequence to align. - :type seq_a: Sequence - :param seq_b: The second sequence to align. - :type seq_b: Sequence - :param gap_penalty: The penalty for inserting a gap. - :type gap_penalty: int - :param end_gap_penalty: The penalty for inserting a gap at the end of a sequence. - :type end_gap_penalty: int - :return: The optimal alignment of the two sequences. - :rtype: ty.List[ty.Tuple[Motif, Motif]] - :raises ValueError: If the sequences and matrix are not compatible. - :raises ValueError: If the sequences are not Sequence objects. - :raises ValueError: If the gap penalties are not integers. - :raises ValueError: If the matrix is not a Matrix object. - :raises ValueError: If the traceback is invalid. 
+ :param aligner: PairwiseAligner object + :param target: target sequence + :param query: query sequence + :return: alignment score """ + return aligner.score(target, query) - def traceback(ri: int, ci: int) -> ty.List[ty.Tuple[Motif, Motif]]: - """Traceback the alignment matrix recursively. - - :param ri: The row index. - :type ri: int - :param ci: The column index. - :type ci: int - :return: The optimal alignment of the two sequences. - :rtype: ty.List[ty.Tuple[Motif, Motif]] - :raises ValueError: If the traceback is invalid. - """ - # End of the traceback. - if ri == 0 and ci == 0: - logger.debug("End of traceback.") - return [] - - if ri == 0: - return traceback(ri, ci - 1) + [(Gap(), seq_b[ci - 1])] - if ci == 0: - return traceback(ri - 1, ci) + [(seq_a[ri - 1], Gap())] - - # Gap penalty. - if ri == len(seq_a) or ci == len(seq_b): - penalty = end_gap_penalty - else: - penalty = gap_penalty - - # Calculate score of current cell and possible moves. - current = matrix.get_value(ri, ci) - horizontal = matrix.get_value(ri, ci - 1) - diagonal = matrix.get_value(ri - 1, ci - 1) - vertical = matrix.get_value(ri - 1, ci) - - # Traceback for match. - if ( - seq_a[ri - 1] == seq_b[ci - 1] - or diagonal > max(horizontal, vertical) - or ((vertical - current) != penalty and (horizontal - current) != penalty) - ): - return traceback(ri - 1, ci - 1) + [(seq_a[ri - 1], seq_b[ci - 1])] - - # Traceback for gap. - if horizontal > vertical: - return traceback(ri, ci - 1) + [(Gap(), seq_b[ci - 1])] - - if vertical >= horizontal: - return traceback(ri - 1, ci) + [(seq_a[ri - 1], Gap())] - - raise ValueError("Invalid traceback.") - - logger = logging.getLogger(__name__) - logger.debug("Tracing back alignment...") - - # Check if matrix is a Matrix object. - if not isinstance(matrix, AlignmentMatrix): - msg = "matrix must be a Matrix object." - logger.error(msg) - raise ValueError(msg) - - # Check if seq_a is Sequence and seq_b is Sequence. 
- if not isinstance(seq_a, Sequence) or not isinstance(seq_b, Sequence): - msg = "seq_a and seq_b must be Sequence objects." - logger.error(msg) - raise ValueError(msg) - - # Check if gap_penalty is int and end_gap_penalty is int. - if not isinstance(gap_penalty, int) or not isinstance(end_gap_penalty, int): - msg = "gap_penalty and end_gap_penalty must be integers." - logger.error(msg) - raise ValueError(msg) - - # Check if seq_a and seq_b could have created the matrix. - if len(seq_a) + 1 != matrix.nrows or len(seq_b) + 1 != matrix.ncols: - msg = "The sequences and matrix are not compatible." - logger.error(msg) - raise ValueError(msg) - - return traceback(ri=matrix.nrows - 1, ci=matrix.ncols - 1) - - -def needleman_wunsch( - seq_a: Sequence, - seq_b: Sequence, - score_func: ty.Callable[[Motif, Motif], int], - gap_penalty: int = 2, - end_gap_penalty: int = 1, -) -> ty.Tuple[Sequence, Sequence, float]: - """Align two sequences using Needleman-Wunsch algorithm. - - :param seq_a: The first sequence to align. - :type seq_a: Sequence - :param seq_b: The second sequence to align. - :type seq_b: Sequence - :param score_func: The function to score a pair of motifs. - :type score_func: ty.Callable[[Motif, Motif], int] - :param gap_penalty: The penalty for inserting a gap. Default is 2. - :type gap_penalty: int - :param end_gap_penalty: The penalty for inserting a gap at the end of a sequence. - Default is 1. - :type end_gap_penalty: int - :return: The optimal alignment of the two sequences and the alignment score. - :rtype: ty.Tuple[Sequence, Sequence, float] - :raises ValueError: If the sequences are not Sequence objects. - :raises ValueError: If the gap penalties are not integers. - :raises ValueError: If the score function is not callable. 
+def pairwise_alignment( + aligner: PairwiseAligner, + target: NDArray[np.int32], + query: NDArray[np.int32], + gap_repr: np.int32 | None = None, +) -> tuple[float, NDArray[np.int32], NDArray[np.int32]]: """ - logger = logging.getLogger(__name__) - logger.debug("Aligning sequences using Needleman-Wunsch algorithm...") - - # Check if seq_a is Sequence and seq_b is Sequence. - if not isinstance(seq_a, Sequence) or not isinstance(seq_b, Sequence): - msg = "seq_a and seq_b must be Sequence objects." - logger.error(msg) - raise ValueError(msg) - - # Check if gap_penalty is int and end_gap_penalty is int. - if not isinstance(gap_penalty, int) or not isinstance(end_gap_penalty, int): - msg = "gap_penalty and end_gap_penalty must be integers." - logger.error(msg) - raise ValueError(msg) - - # Check if score_func is callable. - if not callable(score_func): - msg = "score_func must be callable." - logger.error(msg) - raise ValueError(msg) - - # Create the alignment matrix. - matrix = create_alignment_matrix_needelman_wunsch( - seq_a=seq_a, - seq_b=seq_b, - gap_penalty=gap_penalty, - end_gap_penalty=end_gap_penalty, - score_func=score_func, - ) - - alignment_score = matrix.alignment_score() - - # Traceback the alignment matrix. 
- aligned = traceback_alignment_needelman_wunsch( - matrix=matrix, - seq_a=seq_a, - seq_b=seq_b, - gap_penalty=gap_penalty, - end_gap_penalty=end_gap_penalty, - ) - - seq_a_aligned = Sequence(seq_a.identifier, [motif_a for motif_a, _ in aligned]) - seq_b_aligned = Sequence(seq_b.identifier, [motif_b for _, motif_b in aligned]) - - logger.debug("Sequences aligned.") - return seq_a_aligned, seq_b_aligned, alignment_score - - -class SmithWatermanDirection(IntEnum): - """Enum for the direction of the Smith-Waterman algorithm.""" - - STOP = 0 - DIAGONAL = 1 - VERTICAL = 2 - HORIZONTAL = 3 - - -def smith_waterman( - seq_a: Sequence, - seq_b: Sequence, - score_func: ty.Callable[[Motif, Motif], int], - gap_penalty: int = 2, -) -> ty.Tuple[Sequence, Sequence, float]: - """Create the alignment matrix for the two sequences. - - :param seq_a: The first sequence to align. - :type seq_a: Sequence - :param seq_b: The second sequence to align. - :type seq_b: Sequence - :param score_func: The function to score a pair of motifs. - :type score_func: ty.Callable[[Motif, Motif], int] - :param gap_penalty: The penalty for inserting a gap. Default is 2. - :type gap_penalty: int - :return: The aligned sequences and the alignment score. - :rtype: ty.Tuple[Sequence, Sequence, float] - :raises ValueError: If the sequences are not Sequence objects. - :raises ValueError: If the gap penalty is not an integer. - :raises ValueError: If the score function is not callable. + Align two sequences and return the alignment score. + + :param aligner: PairwiseAligner object + :param target: target sequence + :param query: query sequence + :param gap_repr: integer representation of gap, defaults to 0 + :return: alignment score or alignment as a tuple of two arrays + :raises ValueError: if cigar string has unexpected format """ - logger = logging.getLogger(__name__) - logger.debug("Aligning sequences using Smith-Waterman algorithm...") - - # Check if seq_a is Sequence and seq_b is Sequence. 
- if not isinstance(seq_a, Sequence) or not isinstance(seq_b, Sequence): - msg = "seq_a and seq_b must be Sequence objects." - logger.error(msg) - raise ValueError(msg) - - # Check if gap_penalty is int. - if not isinstance(gap_penalty, int): - msg = "gap_penalty must be an integer." - logger.error(msg) - raise ValueError(msg) - - # Check if score_func is callable. - if not callable(score_func): - msg = "score_func must be callable." - logger.error(msg) - raise ValueError(msg) - - # Instatiate the alignment matrix. - nrows = len(seq_a) + 1 - ncols = len(seq_b) + 1 - matrix = AlignmentMatrix(nrows, ncols, fill=0.0) - - # Initialize tracing matrix. - tracing_matrix = np.zeros(shape=(nrows, ncols), dtype=int) - - # Initialize maximum score and its position. - max_score: ty.Optional[float] = None - max_index: ty.Optional[ty.Tuple[int, int]] = None - - logger.debug("Creating alignment matrix...") - - # Fill in the alignment matrix. - for ri in range(1, nrows): - for ci in range(1, ncols): - - # Calculate score of current cell and possible moves. - diagonal = matrix.get_value(ri - 1, ci - 1) + score_func(seq_a[ri - 1], seq_b[ci - 1]) - vertical = matrix.get_value(ri - 1, ci) - gap_penalty - horizontal = matrix.get_value(ri, ci - 1) - gap_penalty - - directions = [ - (SmithWatermanDirection.STOP, 0), - (SmithWatermanDirection.DIAGONAL, diagonal), - (SmithWatermanDirection.VERTICAL, vertical), - (SmithWatermanDirection.HORIZONTAL, horizontal), - ] - - # Calculate the alignment score. - highest_score_index = np.argmax([score for _, score in directions]) - highest_score_direction = directions[highest_score_index][0] - highest_score_value = directions[highest_score_index][1] - matrix.set_value(row=ri, col=ci, value=highest_score_value) - tracing_matrix[ri, ci] = highest_score_direction - - # Update maximum score and its position. 
- if max_score is None or highest_score_value > max_score: - max_score = highest_score_value - max_index = (ri, ci) - - if max_score is None or max_index is None: - msg = "Maximum score and its position not found." - logger.error(msg) - raise ValueError(msg) - - logger.debug("Alignment matrix created.") - - # Traceback the alignment matrix. - aligned_a: ty.List[Motif] = [] - aligned_b: ty.List[Motif] = [] - current_aligned_a: ty.Optional[Motif] = None - current_aligned_b: ty.Optional[Motif] = None - - logger.debug("Tracing back alignment...") - ri, ci = max_index - - while tracing_matrix[ri, ci] != SmithWatermanDirection.STOP: - if tracing_matrix[ri, ci] == SmithWatermanDirection.DIAGONAL: - current_aligned_a = seq_a[ri - 1] - current_aligned_b = seq_b[ci - 1] - ri -= 1 - ci -= 1 - - elif tracing_matrix[ri, ci] == SmithWatermanDirection.VERTICAL: - current_aligned_a = seq_a[ri - 1] - current_aligned_b = Gap() - ri -= 1 - - elif tracing_matrix[ri, ci] == SmithWatermanDirection.HORIZONTAL: - current_aligned_a = Gap() - current_aligned_b = seq_b[ci - 1] - ci -= 1 - + if gap_repr is None: + gap_repr = np.int32(0) + + alignments = aligner.align(seqA=target, seqB=query) + + # Pick first alignment + alignment = alignments[0] + score = alignments[0].score + + t_a: list[np.int32] = [] + q_a: list[np.int32] = [] + for i in range(alignment.coordinates.shape[1] - 1): + a = alignment.coordinates[0][i : i + 2] + b = alignment.coordinates[1][i : i + 2] + len_a = a[1] - a[0] + len_b = b[1] - b[0] + if len_a == len_b: + t_a.extend(target[a[0] : a[1]]) + q_a.extend(query[b[0] : b[1]]) + elif len_a == 0: + t_a.extend([gap_repr] * len_b) + q_a.extend(query[b[0] : b[1]]) + elif len_b == 0: + t_a.extend(target[a[0] : a[1]]) + q_a.extend([gap_repr] * len_a) else: - msg = "Invalid traceback. Should be diagonal, vertical, or horizontal." 
- logger.error(msg) - raise ValueError(msg) - - if current_aligned_a is not None and current_aligned_b is not None: - aligned_a = aligned_a + [current_aligned_a] - aligned_b = aligned_b + [current_aligned_b] - else: - msg = "Invalid traceback. Found motif as None." - logger.error(msg) - raise ValueError(msg) - - logger.debug("Alignment traced back.") - - # Reverse the aligned sequences. - aligned_a = aligned_a[::-1] - aligned_b = aligned_b[::-1] - - logger.debug("Sequences aligned.") + raise ValueError("unexpected alignment") return ( - Sequence(seq_a.identifier, aligned_a), - Sequence(seq_b.identifier, aligned_b), - max_score, + score, + np.array(t_a, dtype=np.int32), + np.array(q_a, dtype=np.int32), ) -def align_pairwise( - seq_a: Sequence, - seq_b: Sequence, - score_func: ty.Callable[[Motif, Motif], int], - algorithm: PairwiseAlignment = PairwiseAlignment.NEEDLEMAN_WUNSCH, - options: ty.Optional[ty.Dict[str, ty.Any]] = None, -) -> ty.Tuple[Sequence, Sequence, float]: - """Align two sequences using pairwise alignment. - - :param seq_a: The first sequence to align. - :type seq_a: Sequence - :param seq_b: The second sequence to align. - :type seq_b: Sequence - :param score_func: The function to score a pair of motifs. - :type score_func: ty.Callable[[Motif, Motif], int] - :param algorithm: The algorithm to use for pairwise alignment. - :type algorithm: PairwiseAlignment - :param options: Additional keyword arguments for the algorithm. - :type options: ty.Optional[ty.Dict[str, ty.Any]] - :return: The optimal alignment of the two sequences and the alignment score. - :rtype: ty.Tuple[Sequence, Sequence, float] - :raises ValueError: If the sequences are not Sequence objects. - :raises ValueError: If the gap penalties are not integers. - :raises ValueError: If the score function is not callable. - :raises ValueError: If the algorithm is not PairwiseAlignment. - :raises NotImplementedError: If the algorithm is not implemented. 
+def calc_pairwise_alignment( + aligner: Aligner, + target: list[str], + query: list[str], + gap_repr: str = DEFAULT_GAP_REPR, +) -> tuple[float, list[str], list[str]]: """ - logger = logging.getLogger(__name__) - - # Check if seq_a is Sequence and seq_b is Sequence. - if not isinstance(seq_a, Sequence) or not isinstance(seq_b, Sequence): - msg = "seq_a and seq_b must be Sequence objects." - logger.error(msg) - raise ValueError(msg) + Align two sequences and return the alignment score or alignment. + + :param aligner: Aligner object + :param target: target sequence + :param query: query sequence + :param gap_repr: gap representation, defaults to DEFAULT_GAP_REPR. Make sure + gap repr is in alphabet of substitution matrix + :return: alignment score or alignment as a tuple of two sequences + """ + aligner_obj: PairwiseAligner = aligner.aligner + label_fn: Callable[[T], Hashable] | None = aligner.label_fn - # Check if score_func is callable. - if not callable(score_func): - msg = "score_func must be callable." - logger.error(msg) - raise ValueError(msg) + # Get int repr of gap; check if is in substitution matrix names + if gap_repr not in aligner_obj.substitution_matrix.names: + raise ValueError("'gap_repr' must be in substitution matrix alphabet") + int_gap_repr = aligner_obj.substitution_matrix.names.index(gap_repr) - # Check if algorithm is PairwiseAlignment. - if not isinstance(algorithm, PairwiseAlignment): - msg = "algorithm must be PairwiseAlignment." 
- logger.error(msg) - raise ValueError(msg) + # Convert sequences into int arrays based on substitution matrix alphabet + int_target = seq_to_arr(target, aligner_obj.substitution_matrix.names, label_fn) + int_query = seq_to_arr(query, aligner_obj.substitution_matrix.names, label_fn) - if algorithm == PairwiseAlignment.NEEDLEMAN_WUNSCH: - seq_a_aligned, seq_b_aligned, alignment_score = needleman_wunsch( - seq_a=seq_a, - seq_b=seq_b, - score_func=score_func, - **(options or {}), - ) - return seq_a_aligned, seq_b_aligned, alignment_score + score, t_a, q_a = pairwise_alignment(aligner_obj, int_target, int_query, gap_repr=np.int32(int_gap_repr)) - elif algorithm == PairwiseAlignment.SMITH_WATERMAN: - seq_a_aligned, seq_b_aligned, alignment_score = smith_waterman( - seq_a=seq_a, - seq_b=seq_b, - score_func=score_func, - **(options or {}), - ) - return seq_a_aligned, seq_b_aligned, alignment_score + # Convert aligned sequences back to list of strings + str_t_a = arr_to_seq(t_a, aligner_obj.substitution_matrix.names) + str_q_a = arr_to_seq(q_a, aligner_obj.substitution_matrix.names) - else: - msg = f"Pairwise alignment algorithm {algorithm} is not implemented." - logger.error(msg) - raise NotImplementedError(msg) + return score, str_t_a, str_q_a diff --git a/src/versalign/printing.py b/src/versalign/printing.py new file mode 100644 index 0000000..9bbf9d9 --- /dev/null +++ b/src/versalign/printing.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 + +"""Pretty printing for alignment results.""" + +from collections.abc import Sequence + + +def format_alignment( + aligned_seqs: Sequence[Sequence[object]], + *, + names: Sequence[str] | None = None, + score: float | None = None, + gap_repr: str = "-", + block_cols: int | None = None, # e.g. 60 to wrap long alignments + show_consensus: bool = True, +) -> str: + """ + Pretty-format aligned sequences with per-column width so multi-char tokens align. 
+ + :param aligned_seqs: list of aligned sequences (each a list/tuple of tokens) + :param names: optional sequence names + :param score: optional alignment score to print at the top + :param gap_repr: string used to represent a gap when computing consensus (defaults to "-") + :param block_cols: if given, wrap output into blocks of this many columns + :param show_consensus: if True, include a simple consensus line + :returns: formatted alignment string + """ + if not aligned_seqs: + return "(no sequences)" + + nseq = len(aligned_seqs) + lengths = [len(s) for s in aligned_seqs] + if len(set(lengths)) != 1: + raise ValueError(f"All aligned sequences must have same length; got {lengths}") + L = lengths[0] + + # Coerce everything to strings once + seqs_str: list[list[str]] = [[str(tok) for tok in seq] for seq in aligned_seqs] + + # Default names if not provided + if names is None: + names = [f"seq{i + 1}" for i in range(nseq)] + if len(names) != nseq: + raise ValueError("names length must match number of sequences") + + # Compute per-column widths based on the widest token at that column + col_w = [0] * L + for j in range(L): + col_w[j] = max(len(seqs_str[i][j]) for i in range(nseq)) + + # Left margin width for names + name_w = max(len(n) for n in names) if names else 0 + + def consensus_char(j: int) -> str: + """Return '|' if all non-gap tokens equal (and at least one non-gap), else ' '.""" + tokens = [seqs_str[i][j] for i in range(nseq)] + nongap = [t for t in tokens if t != gap_repr] + if not nongap: + return " " + # If there is at least one gap, or not all equal, return ' ' + if len(nongap) < len(tokens): + return " " + return "|" if len(set(nongap)) == 1 else " " + + def render_block(col_start: int, col_end: int) -> list[str]: + # Build lines for each sequence + lines: list[str] = [] + for i in range(nseq): + left = (names[i].rjust(name_w) + " > ") if name_w > 0 else "" + cells = [ + seqs_str[i][j].ljust(col_w[j]) # left-pad tokens so columns align + for j in 
range(col_start, col_end) + ] + lines.append(left + " ".join(cells)) + if show_consensus: + left = (" " * name_w + " ") if name_w > 0 else "" + c = [consensus_char(j) for j in range(col_start, col_end)] + # Consensus is one char per column; widen to column width by padding right + c_cells = [c[j - col_start].ljust(col_w[j]) for j in range(col_start, col_end)] + lines.append(left + " ".join(c_cells)) + return lines + + # Wrap into blocks if requested + blocks: list[str] = [] + if block_cols and block_cols > 0: + for start in range(0, L, block_cols): + end = min(start + block_cols, L) + blocks.extend(render_block(start, end)) + if end < L: + blocks.append("") # blank line between blocks + else: + blocks.extend(render_block(0, L)) + + header = [] + if score is not None: + header.append(str(score)) + + return "\n".join(header + blocks) diff --git a/src/versalign/scoring.py b/src/versalign/scoring.py new file mode 100644 index 0000000..ab5cb97 --- /dev/null +++ b/src/versalign/scoring.py @@ -0,0 +1,142 @@ +"""Scoring module for sequence alignment.""" + +from collections.abc import Callable, Hashable, Sequence +from typing import Any, TypeVar + +import numpy as np +import pandas as pd + +from versalign.aligner import substitution_matrices + +T = TypeVar("T") + + +def default_compare(a: Any, b: Any) -> float: + """ + Default comparison function for scoring. + + :param a: first element to compare + :param b: second element to compare + :return: 1.0 if elements are equal, else 0.0 + """ + return 1.0 if a == b else 0.0 + + +def ensure_hashable_unique(labels: Sequence[Hashable]) -> None: + """ + Ensure that all labels are hashable and unique. 
+ + :param labels: sequence of labels to check + :raises TypeError: if any label is not hashable + :raises ValueError: if any label is not unique + """ + # Hashability check; will raise TypeError if not hashable + for lbl in labels: + hash(lbl) + + # Uniqueness check; will raise ValueError if not unique + if len(set(labels)) != len(labels): + dupes = [x for x in labels if labels.count(x) > 1] + raise ValueError(f"Labels must be unique; duplicates found: {sorted(set(dupes))}") + + +def labels_from_objs(objs: Sequence[T], label_fn: Callable[[T], Hashable] | None) -> tuple[Hashable, ...]: + """ + Generate labels from a sequence of objects. + + :param objs: sequence of objects to generate labels from + :param label_fn: function to generate a label from an object; if None, use + the object itself or its string representation + :return: tuple of labels + :raises TypeError: if any label is not hashable + :raises ValueError: if any label is not unique + """ + if label_fn is None: + # Use objects themselves as labels if possible; fall back to str(...) + raw = [] + for o in objs: + try: + hash(o) + raw.append(o) + except TypeError: + raw.append(str(o)) + labels: tuple[Hashable, ...] = tuple(raw) + else: + labels = tuple(label_fn(o) for o in objs) + + # Validate labels + ensure_hashable_unique(labels) + + return labels + + +def create_substitution_matrix(df: pd.DataFrame) -> tuple[substitution_matrices.Array, tuple[str, ...]]: + """ + Parse a substitution matrix from a DataFrame. 
+ + :param df: parsed data from the input file containing the substitution matrix + :return: tuple of substitution matrix and alphabet + :raises ValueError: if the substitution matrix is not square + """ + # Check if DataFrame is square + if df.shape[0] != df.shape[1]: + raise ValueError("Substitution matrix must be square") + + alphabet = tuple(df.columns) + data = df.to_numpy(np.float64) + + # Create substitution matrix + sm = substitution_matrices.Array(alphabet, 2, data, np.float64) + sm.names = alphabet + + return sm, alphabet + + +def create_substituion_matrix_dynamically( + objs: Sequence[T], + compare: Callable[[T, T], float] | None = None, + *, + label_fn: Callable[[T], Hashable] | None = None, + dtype: type = float, + symmetric: bool = True, +) -> tuple[pd.DataFrame, tuple[str, ...]]: + """ + Create a substitution matrix dynamically. + + :param objs: sequence of objects to create the substitution matrix from + :param compare: function to compare two objects and return a score; if None, + use the default comparison function + :param label_fn: function to generate a label from an object; if None, use + the object itself or its string representation + :param dtype: data type for the substitution matrix + :param symmetric: whether the substitution matrix is symmetric + :return: substitution matrix as a DataFrame + :raises ValueError: if the input sequence is empty + """ + if not objs: + raise ValueError("Cannot create substitution matrix from empty sequence") + + cmp_fn = compare or default_compare + labels = labels_from_objs(objs, label_fn) + n = len(objs) + + data = np.empty((n, n), dtype=dtype) + + # Fill in data + if symmetric: + for i in range(n): + # Diagonal first; often faster to set explicitly + data[i, i] = cmp_fn(objs[i], objs[i]) + for j in range(i + 1, n): + s = cmp_fn(objs[i], objs[j]) + data[i, j] = s + data[j, i] = s + else: + for i in range(n): + for j in range(n): + data[i, j] = cmp_fn(objs[i], objs[j]) + + # Create substitution matrix + df 
= pd.DataFrame(data, index=labels, columns=labels) + + return create_substitution_matrix(df) diff --git a/src/versalign/sequence.py b/src/versalign/sequence.py deleted file mode 100644 index 590ad9f..0000000 --- a/src/versalign/sequence.py +++ /dev/null @@ -1,124 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Implementation of sequence object comprising a list of motifs.""" - -import logging -import typing as ty - -from .motif import Motif - - -class Sequence: - """Class for representing a sequence object.""" - - def __init__(self, identifier: str, motifs: ty.Optional[ty.List[Motif]] = None) -> None: - """Initialize the sequence object. - - :param identifier: The identifier of the sequence. - :type identifier: str - :param motifs: The list of motifs in the sequence. - :type motifs: ty.Optional[ty.List[Motif]] - :raises ValueError: If any element in the list is not a motif. - """ - logger = logging.getLogger(__name__) - - self._identifier = identifier - - if motifs is None: - motifs = [] - - if not all(isinstance(motif, Motif) for motif in motifs): - msg = "All elements in the list must be motifs." - logger.error(msg) - raise ValueError(msg) - - self._motifs = motifs - - def __len__(self) -> int: - """Return the length of the sequence. - - :return: The length of the sequence. - :rtype: int - """ - return len(self._motifs) - - def __getitem__(self, index: int) -> Motif: - """Return the motif at the specified index. - - :param index: The index of the motif to return. - :type index: int - :return: The motif at the specified index. - :rtype: Motif - """ - return self._motifs[index] - - def __setitem__(self, index: int, motif: Motif) -> None: - """Set the motif at the specified index. - - :param index: The index of the motif to set. - :type index: int - :param motif: The motif to set. - :type motif: Motif - :raises ValueError: If the motif is not a motif object. 
- """ - logger = logging.getLogger(__name__) - - if not isinstance(motif, Motif): - msg = "The motif must be a motif object." - logger.error(msg) - raise ValueError(msg) - - self._motifs[index] = motif - - def __iter__(self) -> ty.Iterator[Motif]: - """Return an iterator over the motifs in the sequence. - - :return: An iterator over the motifs. - :rtype: ty.Iterator[Motif] - """ - return iter(self._motifs) - - def __str__(self) -> str: - """Convert the sequence to a string representation. - - :return: The string representation of the sequence. - :rtype: str - """ - return "".join(str(motif) for motif in self._motifs) - - @property - def identifier(self) -> str: - """Return the identifier of the sequence. - - :return: The identifier of the sequence. - :rtype: str - """ - return self._identifier - - def insert(self, index: int, motif: Motif) -> None: - """Insert the motif at the specified index. - - :param index: The index to insert the motif. - :type index: int - :param motif: The motif to insert. - :type motif: Motif - :raises ValueError: If the motif is not a motif object. - """ - logger = logging.getLogger(__name__) - - if not isinstance(motif, Motif): - msg = "The motif must be a motif object." - logger.error(msg) - raise ValueError(msg) - - self._motifs.insert(index, motif) - - def tag(self) -> None: - """Tag all motifs in the sequence.""" - for motif_idx, motif in enumerate(self._motifs): - motif.set_tag(motif_idx) - - def clear_tags(self) -> None: - """Clear the tags for all motifs in the sequence.""" - for motif in self._motifs: - motif.clear_tag() diff --git a/src/versalign/version.py b/src/versalign/version.py index 6d8bc5f..2be948f 100644 --- a/src/versalign/version.py +++ b/src/versalign/version.py @@ -1,51 +1,11 @@ -# -*- coding: utf-8 -*- +"""Version information for versalign.""" -"""Version information for :mod:`versalign`. 
+from importlib.metadata import PackageNotFoundError, version -Run with ``python -m versalign.version`` -""" - -import os -from subprocess import CalledProcessError, check_output # noqa: S404 - -__all__ = [ - "VERSION", - "get_version", - "get_git_hash", -] - -VERSION = "0.0.2-dev" - - -def get_git_hash() -> str: - """Get the :mod:`versalign` git hash. - - :return: The git hash of the current commit. - :rtype: str - """ - with open(os.devnull, "w", encoding="utf-8") as devnull: - try: - ret = check_output( # noqa: S603,S607 - ["git", "rev-parse", "HEAD"], - cwd=os.path.dirname(__file__), - stderr=devnull, - ) - except CalledProcessError: - return "UNHASHED" - else: - return ret.strip().decode("utf-8")[:8] - - -def get_version(with_git_hash: bool = False): - """Get the :mod:`versalign` version string, including a git hash. - - :param with_git_hash: Whether to include the git hash in the version string. - :type with_git_hash: bool - :return: The version string. - :rtype: str - """ - return f"{VERSION}-{get_git_hash()}" if with_git_hash else VERSION - - -if __name__ == "__main__": - print(get_version(with_git_hash=True)) # noqa:T201 +try: + __version__ = version("versalign") +except PackageNotFoundError: + # When running in a source checkout and haven’t installed it yet, + # importlib.metadata might not find “versalign” in site‐packages. 
+ # Fallback to a hard-coded default or raise an error: + __version__ = "0.0.0" diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index a062aea..0000000 --- a/tests/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Tests for :mod:`versalign`.""" diff --git a/tests/test_integration_alignment.py b/tests/test_integration_alignment.py new file mode 100644 index 0000000..2c27ea5 --- /dev/null +++ b/tests/test_integration_alignment.py @@ -0,0 +1,127 @@ +# -*- coding: utf-8 -*- + +"""Integration tests for alignment functionalities.""" + +import re +import math + +import numpy as np +import pytest + +from versalign.aligner import setup_aligner +from versalign.pairwise import calc_pairwise_alignment +from versalign.printing import format_alignment +from versalign.scoring import create_substituion_matrix_dynamically + + +purines = {"A", "G"} +pyrimidines = {"C", "T"} + + +def dna_ti_tv_compare(a: str, b: str) -> int: + """DNA: match=+2, transition=-1, transversion=-2, gap=-3 (gaps will be introduced by the aligner).""" + if a == b: + return 2 + if (a in purines and b in purines) or (a in pyrimidines and b in pyrimidines): + return -1 # transition + return -2 # transversion + + +hydrophobic = set("AILMVFWY") +polar = set("STNQ") +positive = set("KRH") +negative = set("DE") +special = set("CGP") + + +def aa_class_compare(a: str, b: str) -> int: + """Proteins by coarse chemical classes.""" + if a == b: + return 3 + def cls(x: str) -> str: + if x in hydrophobic: return "hydro" + if x in polar: return "polar" + if x in positive: return "pos" + if x in negative: return "neg" + if x in special: return "special" + return "other" + return 1 if cls(a) == cls(b) else -2 + + +def test_pairwise_dna_ti_tv_single_symbols(): + """ + Integration: dynamic matrix (DNA ti/tv), setup_aligner, calc_pairwise_alignment. + We use 1-character sequences so the aligned score equals the substitution score. 
+ """ + objs = list("ACGT-") + sm, _ = create_substituion_matrix_dynamically(objs, compare=dna_ti_tv_compare) + aligner = setup_aligner(sm, "global") + + # A vs A -> +2 (match) + s, aln1, aln2 = calc_pairwise_alignment(aligner, list("A"), list("A"), gap_repr='-') + assert s == 2 + assert aln1 == ["A"] and aln2 == ["A"] + + # A vs G -> -1 (transition) + s, aln1, aln2 = calc_pairwise_alignment(aligner, list("A"), list("G"), gap_repr='-') + assert s == -1 + assert len(aln1) == len(aln2) == 1 + + # C vs A -> -2 (transversion) + s, aln1, aln2 = calc_pairwise_alignment(aligner, list("C"), list("A"), gap_repr='-') + assert s == -2 + assert len(aln1) == len(aln2) == 2 + + +def test_pairwise_protein_class_single_symbols(): + """ + Integration: protein class scoring. + A vs V are both hydrophobic -> +1 (same class, not identical) + A vs D is hydro vs negative -> -2 (different class) + """ + objs = list("ACDEFGHIKLMNPQRSTVWY-") + sm, _ = create_substituion_matrix_dynamically(objs, compare=aa_class_compare) + aligner = setup_aligner(sm, "global") + + # A vs V (hydro vs hydro) -> +1 + s, aln1, aln2 = calc_pairwise_alignment(aligner, list("A"), list("V"), gap_repr='-') + assert s == 1 + assert aln1 == ["A"] and aln2 == ["V"] + + # A vs D (hydro vs negative) -> -2 + s, aln1, aln2 = calc_pairwise_alignment(aligner, list("A"), list("D"), gap_repr='-') + assert s == -2 + assert aln1 == ["A", "-"] and aln2 == ["-", "D"] + + +def test_format_alignment_multichar_and_consensus_alignment(): + """ + Integration: verify the pretty-printer handles multi-char tokens, uneven name widths, + and a strict consensus line that only marks exact, non-gap matches. + We bypass the aligner here (feed aligned tokens directly) to make assertions crisp. 
+ """ + aln1 = ["ALA", "---", "GLY", "THR", "VAL"] + aln2 = ["ALA", "SER", "GLY", "THR", "VAL"] + names = ["short", "a-very-long-name"] + + formatted = format_alignment( + [aln1, aln2], + names=names, + score=42.0, + gap_repr="---", + show_consensus=True, + ) + + # Contains the score header + assert formatted.splitlines()[0].strip() == "42.0" + + # Contains both name headers (right-justified left gutter) and multi-char tokens + assert "short > ALA --- GLY THR VAL" in formatted + assert "a-very-long-name > ALA SER GLY THR VAL" in formatted + + # Consensus should be present only at columns where tokens are identical and non-gap. + # Here: columns 1, 3, 4, 5 are identical; column 2 differs (--- vs SER). + lines = formatted.splitlines() + consensus_line = next(line for line in lines if line.strip().startswith("|") or line.rstrip().endswith("|")) + # Count '|' characters + assert consensus_line.count("|") == 4 diff --git a/tests/test_msa.py b/tests/test_msa.py deleted file mode 100644 index ce35770..0000000 --- a/tests/test_msa.py +++ /dev/null @@ -1,132 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Contains unit tests for the versalign.msa module.""" - -import typing as ty -import unittest - -from versalign.motif import Motif -from versalign.msa import multiple_sequence_alignment, pairwise_scoring_matrix -from versalign.sequence import Sequence - - -class A(Motif): - """Dummy motif.""" - - def __eq__(self, other: ty.Any) -> bool: - """Compare equality of motifs. - - :param other: The other motif to compare. - :type other: ty.Any - :return: True if the motifs are equal, False otherwise. - :rtype: bool - """ - return isinstance(other, A) - - def __str__(self) -> str: - """Convert motif to string representation.""" - return "A" - - -class B(Motif): - """Dummy motif.""" - - def __eq__(self, other: ty.Any) -> bool: - """Compare equality of motifs. - - :param other: The other motif to compare. - :type other: ty.Any - :return: True if the motifs are equal, False otherwise. 
- :rtype: bool - """ - return isinstance(other, B) - - def __str__(self) -> str: - """Convert motif to string representation.""" - return "B" - - -class C(Motif): - """Dummy motif.""" - - def __eq__(self, other: ty.Any) -> bool: - """Compare equality of motifs. - - :param other: The other motif to compare. - :type other: ty.Any - :return: True if the motifs are equal, False otherwise. - :rtype: bool - """ - return isinstance(other, C) - - def __str__(self) -> str: - """Convert motif to string representation.""" - return "C" - - -def score_func(a: Motif, b: Motif) -> int: - """Score function for pairwise alignment.""" - if a == b: - return 1 - - return -1 - - -class TestPairwiseScoringMatrix(unittest.TestCase): - """Test the pairwise_scoring_matrix function.""" - - def test_pairwise_scoring_matrix_1(self) -> None: - """Test the pairwise_scoring_matrix function.""" - seq1 = Sequence("seq_a", [A(), A(), A(), A()]) - seq2 = Sequence("seq_b", [B(), B(), B(), B()]) - seq3 = Sequence("seq_c", [C(), C(), C(), C()]) - seqs = [seq1, seq2, seq3] - matrix = pairwise_scoring_matrix(seqs, 2, 1, score_func) - self.assertEqual(matrix.min_value, 0.0) - self.assertEqual(matrix.max_value, 4.0) - - def test_pairwise_scoring_matrix_2(self) -> None: - """Test the pairwise_scoring_matrix function.""" - seq1 = Sequence("seq_a", [A(), A(), A(), A()]) - seq2 = Sequence("seq_b", [A(), A(), A(), A()]) - seq3 = Sequence("seq_c", [B(), B(), B(), B()]) - seqs = [seq1, seq2, seq3] - matrix = pairwise_scoring_matrix(seqs, 2, 1, score_func) - self.assertEqual(matrix.min_value, 0.0) - self.assertEqual(matrix.max_value, 4.0) - - def test_pairwise_scoring_matrix_3(self) -> None: - """Test the pairwise_scoring_matrix function.""" - seq1 = Sequence("seq_a", [A(), A(), A(), A()]) - seq2 = Sequence("seq_b", [A(), A(), A(), A()]) - seq3 = Sequence("seq_c", [A(), A(), A(), A()]) - seqs = [seq1, seq2, seq3] - matrix = pairwise_scoring_matrix(seqs, 2, 1, score_func) - self.assertEqual(matrix.min_value, 4.0) - 
self.assertEqual(matrix.max_value, 4.0) - - -class TestMultipleSequenceAlignment(unittest.TestCase): - """Test the multiple_sequence_alignment function.""" - - def test_multiple_sequence_alignment_1(self) -> None: - """Test the multiple_sequence_alignment function.""" - seq1 = Sequence("seq_a", [A(), A(), A(), A()]) - seq2 = Sequence("seq_b", [B(), B(), B(), B()]) - seq3 = Sequence("seq_c", [C(), C(), C(), C()]) - seqs = [seq1, seq2, seq3] - result = multiple_sequence_alignment(seqs, 2, 1, score_func) - self.assertEqual(len(result), 3) - self.assertTrue(all(isinstance(seq, Sequence) for seq in result)) - self.assertTrue(all([len(seq) == 12 for seq in result])) - - def test_multiple_sequence_alignment_2(self) -> None: - """Test the multiple_sequence_alignment function.""" - seq1 = Sequence("seq_a", [A(), A(), A(), A()]) - seq2 = Sequence("seq_b", [B(), B(), B(), B()]) - seq3 = Sequence("seq_c", [B(), B(), B(), B()]) - seqs = [seq1, seq2, seq3] - result = multiple_sequence_alignment(seqs, 2, 1, score_func) - self.assertEqual(len(result), 3) - self.assertTrue(all(isinstance(seq, Sequence) for seq in result)) - self.assertTrue(all([len(seq) == 8 for seq in result])) diff --git a/tests/test_pairwise.py b/tests/test_pairwise.py deleted file mode 100644 index e358ac8..0000000 --- a/tests/test_pairwise.py +++ /dev/null @@ -1,138 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Contains unit tests for the versalign.pairwise module.""" - -import typing as ty -import unittest - -from versalign.motif import Gap, Motif -from versalign.pairwise import PairwiseAlignment, align_pairwise -from versalign.sequence import Sequence - -GLOBAL = PairwiseAlignment.NEEDLEMAN_WUNSCH -LOCAL = PairwiseAlignment.SMITH_WATERMAN - - -class A(Motif): - """Dummy motif.""" - - def __eq__(self, other: ty.Any) -> bool: - """Compare equality of motifs. - - :param other: The other motif to compare. - :type other: ty.Any - :return: True if the motifs are equal, False otherwise. 
- :rtype: bool - """ - return isinstance(other, A) - - def __str__(self) -> str: - """Convert motif to string representation.""" - return "A" - - -class B(Motif): - """Dummy motif.""" - - def __eq__(self, other: ty.Any) -> bool: - """Compare equality of motifs. - - :param other: The other motif to compare. - :type other: ty.Any - :return: True if the motifs are equal, False otherwise. - :rtype: bool - """ - return isinstance(other, B) - - def __str__(self) -> str: - """Convert motif to string representation.""" - return "B" - - -class C(Motif): - """Dummy motif.""" - - def __eq__(self, other: ty.Any) -> bool: - """Compare equality of motifs. - - :param other: The other motif to compare. - :type other: ty.Any - :return: True if the motifs are equal, False otherwise. - :rtype: bool - """ - return isinstance(other, C) - - def __str__(self) -> str: - """Convert motif to string representation.""" - return "C" - - -def score_func(a: Motif, b: Motif) -> int: - """Score function for pairwise alignment.""" - if a == b: - return 1 - - return -1 - - -class TestPairwiseNeedlemanWunsch(unittest.TestCase): - """Test pairwise alignment using Needleman-Wunsch algorithm.""" - - def test_align_sequence_1(self): - """Test aligning two sequences.""" - seq_a = Sequence("seq_a", [A(), A(), A(), A()]) - seq_b = Sequence("seq_b", [B(), B(), B(), B()]) - options = {"gap_penalty": 2, "end_gap_penalty": 1} - result_a, result_b, _ = align_pairwise(seq_a, seq_b, score_func, GLOBAL, options) - expected_a = [Gap(), Gap(), Gap(), Gap(), A(), A(), A(), A()] - expected_b = [B(), B(), B(), B(), Gap(), Gap(), Gap(), Gap()] - self.assertEqual(result_a._motifs, expected_a) - self.assertEqual(result_b._motifs, expected_b) - - def test_align_sequence_2(self): - """Test aligning two sequences.""" - seq_a = Sequence("seq_a", [A(), A(), A(), A()]) - seq_b = Sequence("seq_b", [A(), A(), A(), A()]) - options = {"gap_penalty": 2, "end_gap_penalty": 1} - result_a, result_b, _ = align_pairwise(seq_a, seq_b, 
score_func, GLOBAL, options) - expected_a = [A(), A(), A(), A()] - expected_b = [A(), A(), A(), A()] - self.assertEqual(result_a._motifs, expected_a) - self.assertEqual(result_b._motifs, expected_b) - - def test_align_sequence_3(self): - """Test aligning two sequences.""" - seq_a = Sequence("seq_a", [A(), A(), B(), B()]) - seq_b = Sequence("seq_b", [B(), B(), C(), C()]) - options = {"gap_penalty": 2, "end_gap_penalty": 1} - result_a, result_b, _ = align_pairwise(seq_a, seq_b, score_func, GLOBAL, options) - expected_a = [A(), A(), B(), B(), Gap(), Gap()] - expected_b = [Gap(), Gap(), B(), B(), C(), C()] - self.assertEqual(result_a._motifs, expected_a) - self.assertEqual(result_b._motifs, expected_b) - - def test_align_sequence_4(self): - """Test aligning two sequences.""" - seq_a = Sequence("seq_a", [A(), A(), A()]) - seq_b = Sequence("seq_b", [A(), A(), A(), A()]) - options = {"gap_penalty": 2, "end_gap_penalty": 1} - result_a, result_b, _ = align_pairwise(seq_a, seq_b, score_func, GLOBAL, options) - expected_a = [Gap(), A(), A(), A()] - expected_b = [A(), A(), A(), A()] - self.assertEqual(result_a._motifs, expected_a) - self.assertEqual(result_b._motifs, expected_b) - - -class TestPairwiseSmithWaterman(unittest.TestCase): - """Test pairwise alignment using Smith-Waterman algorithm.""" - - def test_align_sequence_1(self): - """Test aligning two sequences.""" - seq_a = Sequence("seq_a", [A(), A()]) - seq_b = Sequence("seq_b", [B(), B(), A(), A(), B()]) - options = {"gap_penalty": 2} - result_a, result_b, _ = align_pairwise(seq_a, seq_b, score_func, LOCAL, options) - expected_a = [A(), A()] - expected_b = [A(), A()] - self.assertEqual(result_a._motifs, expected_a) - self.assertEqual(result_b._motifs, expected_b) diff --git a/tests/test_version.py b/tests/test_version.py deleted file mode 100644 index b7443f0..0000000 --- a/tests/test_version.py +++ /dev/null @@ -1,19 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Contains unit tests for the versalign.version module.""" - 
-import unittest - -from versalign.version import get_version - - -class TestVersion(unittest.TestCase): - """Trivially test a version.""" - - def test_version_type(self): - """Test the version is a string. - - This is only meant to be an example test. - """ - version = get_version() - self.assertIsInstance(version, str) diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 35218b8..0000000 --- a/tox.ini +++ /dev/null @@ -1,176 +0,0 @@ -# Tox (http://tox.testrun.org/) is a tool for running tests -# in multiple virtualenvs. This configuration file will run the -# test suite on all supported python versions. To use it, "pip install tox" -# and then run "tox" from this directory. - -[tox] -# To use a PEP 517 build-backend you are required to configure tox to use an isolated_build: -# https://tox.readthedocs.io/en/latest/example/package.html -isolated_build = True - -# These environments are run in order if you just use `tox`: -envlist = - lint - pyroma - flake8 - mypy - docstr-coverage - py - -[testenv] -# Runs on the "tests" directory by default, or passes the positional -# arguments from `tox -e py ... -commands = - coverage run -p -m pytest --durations=20 {posargs:tests} - coverage combine - coverage xml -extras = - # See the [options.extras_require] entry in setup.cfg for "tests" - tests - -[testenv:doctests] -description = Test that documentation examples run properly -commands = - xdoctest -m src -deps = - xdoctest - pygments - -[testenv:treon] -description = Test that notebooks can run to completion -commants = - treon notebooks/ -deps = - treon - -[testenv:coverage-clean] -deps = coverage -skip_install = true -commands = coverage erase - -[testenv:lint] -deps = - black[jupyter] - isort - nbqa -skip_install = true -commands = - black . - isort . - nbqa isort . -description = Run linters. - -[testenv:doclint] -deps = - rstfmt -skip_install = true -commands = - rstfmt docs/source/ -description = Run documentation linters. 
- -[testenv:flake8] -skip_install = true -deps = - darglint - flake8 - flake8-black - flake8-bandit - flake8-bugbear - flake8-colors - flake8-docstrings - flake8-isort - flake8-print - pep8-naming - pydocstyle -commands = - flake8 src/ tests/ -description = Run the flake8 tool with several plugins (bandit, docstrings, import order, pep8 naming). See https://cthoyt.com/2020/04/25/how-to-code-with-me-flake8.html for more information. - -[testenv:pyroma] -deps = - pygments - pyroma -skip_install = true -commands = pyroma --min=10 . -description = Run the pyroma tool to check the package friendliness of the project. - -[testenv:mypy] -deps = - mypy - pydantic -skip_install = true -commands = mypy --install-types --non-interactive --ignore-missing-imports src/ -description = Run the mypy tool to check static typing on the project. - -[testenv:docstr-coverage] -skip_install = true -deps = - docstr-coverage -commands = - docstr-coverage src/ tests/ --skip-private --skip-magic -description = Run the docstr-coverage tool to check documentation coverage - -[testenv:coverage-report] -deps = coverage -skip_install = true -commands = - coverage combine - coverage report - -#################### -# Deployment tools # -#################### - -[testenv:bumpversion] -commands = bump2version {posargs} -skip_install = true -passenv = HOME -deps = - bump2version - -[testenv:build] -skip_install = true -deps = - wheel - build - setuptools -commands = - python -m build --sdist --wheel --no-isolation - -[testenv:release] -description = Release the code to PyPI so users can pip install it -skip_install = true -deps = - {[testenv:build]deps} - twine >= 1.5.0 -commands = - {[testenv:build]commands} - twine upload --skip-existing --config-file .pypirc dist/* - -[testenv:testrelease] -description = Release the code to the test PyPI site -skip_install = true -deps = - {[testenv:build]deps} - twine >= 1.5.0 -commands = - {[testenv:build]commands} - twine upload --skip-existing --config-file 
.pypirc -r testpypi dist/* - -[testenv:finish] -skip_install = true -passenv = - HOME - TWINE_USERNAME - TWINE_PASSWORD -deps = - {[testenv:release]deps} - bump2version -commands = - bump2version release --tag - {[testenv:release]commands} - git push --tags - bump2version patch - git push -allowlist_externals = - git \ No newline at end of file