diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..c07c1ff --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,4 @@ +{ + "image": "condaforge/miniforge3", + "postCreateCommand": "apt-get update && apt-get install -y build-essential && pip install -U -r dev-requirements.txt" +} \ No newline at end of file diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..4a410ce --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "" # See documentation for possible values + directory: "/" # Location of package manifests + schedule: + interval: "weekly" \ No newline at end of file diff --git a/.github/workflows/pip_install_unitest.yml b/.github/workflows/pip_install_unitest.yml index afd1a5c..d7a1c42 100644 --- a/.github/workflows/pip_install_unitest.yml +++ b/.github/workflows/pip_install_unitest.yml @@ -1,60 +1,67 @@ name: pip_install_unitest on: - workflow_dispatch + workflow_dispatch: + inputs: + install_prerelease: + description: 'Check this to install the prerelease version of medspacy if available and the version is newer than formal release.' 
+ type: boolean + required: false + default: false -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true jobs: - build_wheels: - name: Build wheel for ${{ matrix.python-version }}-${{ matrix.buildplat[1] }} - if: >- - github.event_name == 'schedule' || - github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && - contains(github.event.pull_request.labels.*.name, 'Build System')) - runs-on: ${{ matrix.buildplat[0] }} + + test: + # TODO: Do we care if this only runs on allowed branches since we check push/pull_request above? + # if: github.ref == 'refs/heads/master' OR github.ref == 'refs/heads/develop' strategy: - # Ensure that a wheel builder finishes even if another fails - fail-fast: false matrix: - # Github Actions doesn't support pairing matrix values together, let's improvise - # https://github.com/github/feedback/discussions/7835#discussioncomment-1769026 - buildplat: - - [ ubuntu-20.04, ubuntu-20.04 ] - - [ macos-10.15, macosx_10 ] - - [ macos-11, macosx_11 ] - - [ macos-12, macosx_12 ] - - [ windows-2019, windows-2019 ] - - [ windows-2022, windows-2022 ] - # spacy doesn't compile win32 - # python: ["cp36"] # Note: Wheels not needed for PyPy - python-version: [ "3.6", "3.7", "3.8", "3.9","3.10.x","3.11.0-rc.2"] # Note: Wheels not needed for PyPy - # python-version: [ "3.7"] - timeout-minutes: 45 + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: [3.9, "3.10.x", "3.11.x","3.12"] + # revised from https://github.com/actions/cache/blob/main/examples.md#python---pip + runs-on: ${{ matrix.os }} + steps: - - name: Checkout - uses: actions/checkout@v3 - - name: set up python - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install cython - pip install PyRuSH==1.0.8 pytest - - - name: nose tests - run: | - # ls 
/opt/hostedtoolcache/Python/3*/x64/lib/python*/site-packages/conf - python -c "import shutil;shutil.rmtree('PyRuSH')" - python -c "import shutil;shutil.rmtree('conf')" - ls - python --version - pytest + - uses: actions/checkout@v4 + - name: Git clone repo and remove source code + run: | + pwd + ls + + - name: Set up pip + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + cache-dependency-path: | + 'requirements/dev-requirements.txt' + - run: | + python --version + pip install --upgrade pip + + + - name: Install prereleased PyRuSH + if: ${{ github.event.inputs.install_prerelease == 'true' }} + run: | + # use this to avoid install prereleases of dependencies packages + pip install --pre PyRuSH + pip install -r dev-requirements.txt + + - name: Install formal released medspacy + if: ${{ github.event.inputs.install_prerelease == 'false' }} + run: | + pip install -I PyRuSH + + + - name: tests + run: | + # ls /opt/hostedtoolcache/Python/3*/x64/lib/python*/site-packages/conf + pip install pytest + python -c "import shutil;shutil.rmtree('PyRuSH')" + # python -c "import shutil;shutil.rmtree('conf')" + ls + python --version + pytest diff --git a/.github/workflows/pip_old_install_unitest.yml b/.github/workflows/pip_old_install_unitest.yml new file mode 100644 index 0000000..4e97cfc --- /dev/null +++ b/.github/workflows/pip_old_install_unitest.yml @@ -0,0 +1,68 @@ +name: pip_old_install_unitest +on: + workflow_dispatch: + inputs: + install_prerelease: + description: 'Check this to install the prerelease version of medspacy if available and the version is newer than formal release.' + type: boolean + required: false + default: false + + +jobs: + + test: + # TODO: Do we care if this only runs on allowed branches since we check push/pull_request above? 
+ # if: github.ref == 'refs/heads/master' OR github.ref == 'refs/heads/develop' + strategy: + matrix: + os: [ubuntu-20.04, macos-12, windows-latest] + # python-version: [3.8] + python-version: [3.6,3.7] + # revised from https://github.com/actions/cache/blob/main/examples.md#python---pip + runs-on: ${{ matrix.os }} + + steps: + - uses: actions/checkout@v4 + - name: Git clone repo and remove source code + run: | + pwd + ls + + - name: Set up pip + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + cache-dependency-path: | + 'requirements/requirements.txt' + - run: | + python --version + pip install --upgrade pip + + + - name: Install prereleased PyRuSH + if: ${{ github.event.inputs.install_prerelease == 'true' }} + run: | + # use this to avoid install prereleases of dependencies packages + pip install -r requirements.txt + pip install --pre PyRuSH + + - name: Install formal released medspacy + if: ${{ github.event.inputs.install_prerelease == 'false' }} + run: | + pip install PyRuSH + + + - name: tests + run: | + # ls /opt/hostedtoolcache/Python/3*/x64/lib/python*/site-packages/conf + pip install pytest + python -c "import shutil;shutil.rmtree('PyRuSH')" + python -c "import shutil;shutil.rmtree('conf')" + ls + python --version + pytest + + + diff --git a/.github/workflows/run_pytests.yml b/.github/workflows/run_pytests.yml index d552e43..4322040 100644 --- a/.github/workflows/run_pytests.yml +++ b/.github/workflows/run_pytests.yml @@ -22,23 +22,22 @@ jobs: # Github Actions doesn't support pairing matrix values together, let's improvise # https://github.com/github/feedback/discussions/7835#discussioncomment-1769026 buildplat: - - [ ubuntu-20.04, ubuntu-20.04 ] - - [ macos-10.15, macosx_10 ] - - [ macos-11, macosx_11 ] - - [ macos-12, macosx_12 ] - - [ windows-2019, windows-2019 ] - - [ windows-2022, windows-2022 ] + - [ ubuntu-latest, ubuntu-latest ] + - [ macos-latest, macosx_latest ] + - [ windows-latest, 
windows-latest ] # spacy doesn't compile win32 # python: ["cp36"] # Note: Wheels not needed for PyPy - python-version: [ "3.6", "3.7", "3.8", "3.9","3.10.x","3.11"] # Note: Wheels not needed for PyPy + python-version: [ "3.9","3.10.x","3.11.x", "3.12"] # Note: Wheels not needed for PyPy timeout-minutes: 45 steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: set up python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} + python-version: ${{ matrix.python-version }} + cache: 'pip' + cache-dependency-path: 'dev-requirements.txt' - name: Install dependencies run: | @@ -48,14 +47,15 @@ jobs: run: | python -m pip install --upgrade pip pip install -r dev-requirements.txt - pip install -U spacy==3.4.1 + pip install -U setuptools - name: pytests run: | python --version + python -c "import numpy;print(numpy.__version__)" + python setup.py build_ext --inplace pip install ./ pytest tests - diff --git a/.github/workflows/wheelbuilder.yml b/.github/workflows/wheelbuilder3.6.yml similarity index 71% rename from .github/workflows/wheelbuilder.yml rename to .github/workflows/wheelbuilder3.6.yml index 067256e..d2210f6 100644 --- a/.github/workflows/wheelbuilder.yml +++ b/.github/workflows/wheelbuilder3.6.yml @@ -1,4 +1,4 @@ -name: Wheel Builder +name: Build_Pub<3.9 on: workflow_dispatch @@ -18,29 +18,30 @@ jobs: timeout-minutes: 50 strategy: # Ensure that a wheel builder finishes even if another fails - fail-fast: false + fail-fast: true matrix: # Github Actions doesn't support pairing matrix values together, let's improvise # https://github.com/github/feedback/discussions/7835#discussioncomment-1769026 buildplat: - [ubuntu-20.04, manylinux_x86_64] - [ubuntu-20.04, musllinux_x86_64] - - [macos-10.15, macosx_*] - - [windows-2019, win_amd64] + - [macos-12, macosx_*] + - [windows-2022, win_amd64] # - [windows-2019, win32] # spacy doesn't compile win32 # python: ["cp37"] # Note: 
Wheels not needed for PyPy - python: ["cp36", "cp37", "cp38", "cp39", "cp310","cp311"] # Note: Wheels not needed for PyPy +# cp37 not working with manylinux_x86_64 somehow + python: ["cp36", "cp38"] # Note: Wheels not needed for PyPy steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install dependencies run: | python3 -m pip install --upgrade pip - name: Build wheels - uses: pypa/cibuildwheel@v2.11.1 + uses: pypa/cibuildwheel@v2.21.3 env: # TODO: Build Cython with the compile-all flag? # Unfortunately, there is no way to modify cibuildwheel's build command @@ -50,7 +51,7 @@ jobs: CIBW_BEFORE_BUILD: pip install -r requirements.txt CIBW_BUILD: ${{ matrix.python }}-${{ matrix.buildplat[1] }} CIBW_ENVIRONMENT: CFLAGS='-O3 -g0 -mtune=generic -pipe -fPIC' LDFLAGS='-fPIC' - CIBW_BEFORE_TEST: pip install pytest + CIBW_BEFORE_TEST: pip install -r dev-requirements.txt CIBW_TEST_COMMAND: pytest {package}/tests - name: check build @@ -58,51 +59,27 @@ jobs: ls -l wheelhouse - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: ${{ matrix.python }}-${{ startsWith(matrix.buildplat[1], 'macosx') && 'macosx' || matrix.buildplat[1] }} path: ./wheelhouse/*.whl - build_sdist: - name: Build sdist - runs-on: ubuntu-latest - steps: - - name: Checkout quicksectx - uses: actions/checkout@v3 - # Used to push the built wheels - - uses: actions/setup-python@v3 - with: - # Build sdist on lowest supported Python - python-version: '3.6' - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements.txt - - - name: Build sdist - run: | - python setup.py sdist - - uses: actions/upload-artifact@v3 - with: - name: sdist - path: ./dist/*.tar.gz - upload_pypi: - needs: [build_wheels, build_sdist] + needs: [build_wheels] runs-on: ubuntu-latest # upload to PyPI on every tag starting with 'v' # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') # alternatively, to 
publish when a GitHub Release is created, use the following rule: # if: github.event_name == 'release' && github.event.action == 'published' steps: - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 - name: Install dependencies run: | pip install twine - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 with: path: dist diff --git a/.github/workflows/wheelbuilder3.9.yml b/.github/workflows/wheelbuilder3.9.yml new file mode 100644 index 0000000..914f25c --- /dev/null +++ b/.github/workflows/wheelbuilder3.9.yml @@ -0,0 +1,126 @@ +name: Build_Pub>=3.9 +on: + workflow_dispatch + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build_wheels: + name: Build wheel for ${{ matrix.python }}-${{ matrix.buildplat[1] }} + if: >- + github.event_name == 'schedule' || + github.event_name == 'workflow_dispatch' || + (github.event_name == 'pull_request' && + contains(github.event.pull_request.labels.*.name, 'Build System')) + runs-on: ${{ matrix.buildplat[0] }} + timeout-minutes: 50 + strategy: + # Ensure that a wheel builder finishes even if another fails + fail-fast: false + matrix: + # Github Actions doesn't support pairing matrix values together, let's improvise + # https://github.com/github/feedback/discussions/7835#discussioncomment-1769026 + buildplat: + - [ubuntu-latest, manylinux_x86_64] + - [ubuntu-latest, musllinux_x86_64] + - [macos-13, macosx_x86_64] + - [macos-latest, macosx_arm64] + - [windows-latest, win_amd64] + # - [windows-2019, win32] +# spacy doesn't compile win32 +# python: ["cp37"] # Note: Wheels not needed for PyPy + python: ["cp39", "cp310","cp311", "cp312"] # Note: Wheels not needed for PyPy + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + python3 -m pip install --upgrade pip --break-system-packages + + - name: Build wheels + uses: pypa/cibuildwheel@v3.1.4 + env: + # TODO: Build Cython with 
the compile-all flag? + # Unfortunately, there is no way to modify cibuildwheel's build command + # so there is no way to pass this in directly. + # This would require modifying cython's setup.py to look for these flags + # in env vars. + CIBW_BEFORE_BUILD: pip install -U -r dev-requirements.txt + CIBW_BUILD: ${{ matrix.python }}-${{ matrix.buildplat[1] }} + CIBW_ENVIRONMENT: CFLAGS='-O3 -g0 -mtune=generic -pipe -fPIC' LDFLAGS='-fPIC' + CIBW_BEFORE_TEST: pip install -r dev-requirements.txt + CIBW_TEST_COMMAND: pytest {package}/tests + + - name: check build + run: | + ls -l wheelhouse + + + - uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.python }}-${{ matrix.buildplat[1] }}-${{ matrix.buildplat[0] }} + path: ./wheelhouse/*.whl + + + + build_sdist: + name: Build sdist + runs-on: ubuntu-latest + steps: + - name: Checkout quicksectx + uses: actions/checkout@v4 + # Used to push the built wheels + - uses: actions/setup-python@v5 + with: + # Build sdist on lowest supported Python + python-version: '3.10' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -U -r dev-requirements.txt + + - name: Build sdist + run: | + python setup.py sdist + - uses: actions/upload-artifact@v4 + with: + name: sdist + path: ./dist/*.tar.gz + + upload_pypi: + needs: [build_wheels, build_sdist] + runs-on: ubuntu-latest + permissions: + id-token: write + # upload to PyPI on every tag starting with 'v' +# if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + # alternatively, to publish when a GitHub Release is created, use the following rule: + # if: github.event_name == 'release' && github.event.action == 'published' + steps: + - uses: actions/setup-python@v5 + - name: Install dependencies + run: | + pip install twine + + - uses: actions/download-artifact@v4 + with: + path: dist + + - name: check downloaded + run: | + find ./dist -type f \( -name "*.whl" -o -name "*.tar.gz" \) -exec mv {} ./dist/ \; + rm -rf dist/cp* + rm 
-rf dist/sdist + ls -R + + - name: Publish package distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages-dir: dist + skip-existing: true + verbose: true + + diff --git a/.gitignore b/.gitignore index 3f0afaf..f414037 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,6 @@ dist *.so *.c *.cpp -.ipynb_checkpoints \ No newline at end of file +.ipynb_checkpoints +__pycache__ +*.pyd \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..1f77e20 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "python-envs.defaultEnvManager": "ms-python.python:conda", + "python-envs.defaultPackageManager": "ms-python.python:conda", + "python-envs.pythonProjects": [], + "chat.tools.autoApprove": true, + "chat.agent.maxRequests": 200 +} \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in index 6a5cefc..d874685 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ +include LICENSE include conf/rush_rules.tsv include PyRuSH/StaticSentencizerFun.pyx include requirements.txt diff --git a/PyRuSH/PyRuSHSentencizer.py b/PyRuSH/PyRuSHSentencizer.py index f296af5..1c7a6b1 100644 --- a/PyRuSH/PyRuSHSentencizer.py +++ b/PyRuSH/PyRuSHSentencizer.py @@ -19,21 +19,28 @@ from spacy.pipeline import Sentencizer from .RuSH import RuSH -from .StaticSentencizerFun import cpredict, cset_annotations +from .StaticSentencizerFun import cpredict_merge_gaps,cpredict_split_gaps, cset_annotations @Language.factory("medspacy_pyrush") class PyRuSHSentencizer(Sentencizer): def __init__(self, nlp: Language, name: str = "medspacy_pyrush", rules_path: str = '', max_repeat: int = 50, - auto_fix_gaps: bool = True) -> Sentencizer: + auto_fix_gaps: bool = True, merge_gaps: bool = False, max_sentence_length: int = None) -> Sentencizer: """ + Initialize the PyRuSH sentencizer component. - @param rules_path: The string of the rule file path or rules themselves. 
By default, it will look for - rush_rules.tsv in the site_packages/conf folder. - @param max_repeat: Total number of replicates that allows to be handled by "+" wildcard. - @param auto_fix_gaps: If gaps are caused by malcrafted rules, try to fix them. - However, this has no control of sentence end, - TODO: need to see how the downsteam spacy components make use of doc.c + Args: + nlp (Language): The spaCy language pipeline. + name (str): Name of the component. Default is "medspacy_pyrush". + rules_path (str): Path to the rule file or rules themselves. If empty, defaults to 'conf/rush_rules.tsv'. + max_repeat (int): Maximum number of repeats allowed for the '+' wildcard in rules. + auto_fix_gaps (bool): If True, attempts to fix gaps caused by malformed rules. + merge_gaps (bool): If True, merges gaps between sentences into the preceding sentence. If False, splits gaps (might be multiple whitespaces or new line characters) into separate sentences. + max_sentence_length (int or None): Maximum allowed sentence length in characters. If set, sentences longer than this will be split. + + Notes: + - Setting merge_gaps controls whether gaps are merged or split. + - max_sentence_length applies to both merge and split modes. """ self.nlp = nlp self.name = name @@ -43,28 +50,66 @@ def __init__(self, nlp: Language, name: str = "medspacy_pyrush", rules_path: str rules_path = str(os.path.join(root, 'conf', 'rush_rules.tsv')) self.rules_path = rules_path self.rush = RuSH(rules=rules_path, max_repeat=max_repeat, auto_fix_gaps=auto_fix_gaps) + self.merge_gaps = merge_gaps + self.max_sentence_length = max_sentence_length @classmethod def from_nlp(cls, nlp, **cfg): + """ + Create a PyRuSHSentencizer instance from a spaCy nlp object and configuration. + + Args: + nlp (Language): The spaCy language pipeline. + **cfg: Additional configuration parameters for initialization. + + Returns: + PyRuSHSentencizer: An initialized sentencizer instance. 
+ """ return cls(**cfg) def __call__(self, doc): + """ + Apply sentence boundary detection to a spaCy Doc and set sentence start annotations. + + Args: + doc (Doc): The spaCy Doc to process. + + Returns: + Doc: The processed Doc with sentence boundaries set. + """ tags = self.predict([doc]) cset_annotations([doc], tags) return doc def predict(self, docs): - """Apply the pipeline's model to a batch of docs, without - modifying them. """ - guesses = cpredict(docs, self.rush.segToSentenceSpans) + Predict sentence boundaries for a batch of spaCy Docs. + + Args: + docs (list of Doc): List of spaCy Docs to process. + + Returns: + list of list of bool: Sentence start guesses for each Doc. + + Notes: + - Does not modify the Docs; only returns sentence start predictions. + """ + if self.merge_gaps: + guesses = cpredict_merge_gaps(docs, self.rush.segToSentenceSpans, self.max_sentence_length) + else: + guesses = cpredict_split_gaps(docs, self.rush.segToSentenceSpans, self.max_sentence_length) return guesses def set_annotations(self, docs, batch_tag_ids, tensors=None): """ - This function overwrite spacy's Sentencizer. + Set sentence boundary annotations on spaCy Docs. + + Args: + docs (list of Doc): List of spaCy Docs to annotate. + batch_tag_ids (list of list of bool): Sentence start tags for each Doc. + tensors: Placeholder for future extensions (optional). - @param batch_tag_ids: a list of doc's tags (a list of boolean values) - @param tensors: a place holder for future extensions + Notes: + - This method overwrites spaCy's Sentencizer annotations. """ cset_annotations(docs, batch_tag_ids, tensors) diff --git a/PyRuSH/RuSH.py b/PyRuSH/RuSH.py index aee98c2..33df884 100644 --- a/PyRuSH/RuSH.py +++ b/PyRuSH/RuSH.py @@ -27,8 +27,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import logging -import logging.config +from loguru import logger import os.path from typing import Union, List @@ -40,40 +39,7 @@ def initLogger(): - config_files = ['../../../conf/logging.ini', '../../conf/logging.ini', '../conf/logging.ini', 'conf/logging.ini', - 'logging.ini'] - config_file = None - for f in config_files: - if os.path.isfile(f): - config_file = f - break - if config_file is None: - config_file = config_files[-1] - with open(config_file, 'w') as f: - f.write('''[loggers] -keys=root - -[handlers] -keys=consoleHandler - -[formatters] -keys=simpleFormatter - -[logger_root] -level=WARNING -handlers=consoleHandler - -[handler_consoleHandler] -class=StreamHandler -level=WARNING -formatter=simpleFormatter -args=(sys.stdout,) - -[formatter_simpleFormatter] -format=%(asctime)s - %(name)s - %(levelname)s - %(message)s -datefmt= -''') - logging.config.fileConfig(config_file) + pass # Removed: logging config logic for Loguru migration class RuSH: @@ -84,9 +50,7 @@ def __init__(self, rules: Union[str, List] = '', max_repeat: int = 50, auto_fix_ self.fastner = FastCNER(rules, max_repeat) self.fastner.span_compare_method = 'scorewidth' if enable_logger: - initLogger() - self.logger = logging.getLogger(__name__) - print(self.logger.level) + self.logger = logger else: self.logger = None self.auto_fix_gaps = auto_fix_gaps @@ -109,13 +73,13 @@ def segToSentenceSpans(self, text): self.fastner.process(text, 0, result) # log important message for debugging use - if self.logger is not None and self.logger.isEnabledFor(logging.DEBUG): + if self.logger is not None: text = text.replace('\n', ' ') for concept_type, spans in result.items(): - self.logger.debug(concept_type) + self.logger.opt(lazy=True).debug(concept_type) for span in spans: rule = self.fastner.rule_store[span.rule_id] - self.logger.debug( + self.logger.opt(lazy=True).debug( '\t{0}-{1}:{2}\t{3}<{4}>\t[Rule {5}:\t{6}\t{7}\t{8}\t{9}]'.format(span.begin, span.end, span.score, text[:span.begin], @@ -185,15 
+149,15 @@ def segToSentenceSpans(self, text): if trimed_gap is not None and trimed_gap.width > self.min_sent_chars: output.append(trimed_gap) - if self.logger is not None and self.logger.isEnabledFor(logging.DEBUG): + if self.logger is not None: for sentence in output: - self.logger.debug( + self.logger.opt(lazy=True).debug( 'Sentence({0}-{1}):\t>{2}<'.format(sentence.begin, sentence.end, text[sentence.begin:sentence.end])) return output @staticmethod - def fix_gap(sentences: [], text: str, previous_end: int, this_begin: int, min_sent_chars: int = 5): + def fix_gap(sentences: list, text: str, previous_end: int, this_begin: int, min_sent_chars: int = 5): trimed_gap = RuSH.trim_gap(text, previous_end, this_begin) if trimed_gap is None: return @@ -203,7 +167,7 @@ def fix_gap(sentences: [], text: str, previous_end: int, this_begin: int, min_se sentences[-1].end = trimed_gap.end @staticmethod - def trim_gap(text: str, previous_end: int, this_begin: int) -> Span: + def trim_gap(text: str, previous_end: int, this_begin: int) -> 'Span | None': begin = -1 alnum_begin = -1 end = 0 diff --git a/PyRuSH/StaticSentencizerFun.pyx b/PyRuSH/StaticSentencizerFun.pyx index b22f67c..e250f02 100644 --- a/PyRuSH/StaticSentencizerFun.pyx +++ b/PyRuSH/StaticSentencizerFun.pyx @@ -1,3 +1,4 @@ +from loguru import logger # ****************************************************************************** # MIT License # @@ -15,40 +16,184 @@ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# ****************************************************************************** -cpdef cpredict(docs, sentencizer_fun): +cpdef cpredict_merge_gaps(docs, sentencizer_fun, max_sentence_length=None): cdef list guesses - cdef int s - cdef int t guesses = [] - for doc in docs: + logger.debug(f"cpredict_merge_gaps called: docs={len(docs)}, max_sentence_length={max_sentence_length}") + for doc_idx, doc in enumerate(docs): + if len(doc) == 0: + guesses.append([]) + continue + doc_guesses = [False] * len(doc) + spans = sentencizer_fun(doc.text) + logger.debug(f"[doc {doc_idx}] {len(spans)} spans detected: {[ (span.begin, span.end) for span in spans ]}") + t = 0 + span_idx = 0 + num_spans = len(spans) + sentence_start_t = 0 + last_ws_token = None + while t < len(doc): + token = doc[t] + # 1. Mark token as sentence start if it overlaps with RuSH span.begin + if span_idx < num_spans and token.idx == spans[span_idx].begin: + doc_guesses[t] = True + logger.debug(f"[doc {doc_idx}] Mark sentence start at token {t}: '{token.text}' idx={token.idx} (span begin)") + sentence_start_t = t + last_ws_token = None + span = spans[span_idx] + # Find last token in span + last_token_in_span = t + while last_token_in_span + 1 < len(doc) and doc[last_token_in_span + 1].idx < span.end: + last_token_in_span += 1 + if last_token_in_span + 1 < len(doc) and doc[last_token_in_span + 1].idx >= span.end: + # Mark whitespace preferred, else next token + if doc[last_token_in_span + 1].text.isspace(): + doc_guesses[last_token_in_span + 1] = True + logger.debug(f"[doc {doc_idx}] Token {last_token_in_span+1} '{doc[last_token_in_span+1].text}' marked as sentence start (span end whitespace)") + else: + doc_guesses[last_token_in_span + 1] = True + logger.debug(f"[doc {doc_idx}] Token {last_token_in_span+1} '{doc[last_token_in_span+1].text}' marked as sentence start (span end next token)") + span_idx += 1 + t += 1 + continue + # 2. 
If token is in gap between spans + if span_idx > 0 and token.idx >= spans[span_idx-1].end and (span_idx < num_spans and token.idx < spans[span_idx].begin): + gap_start = t + gap_end = t + while gap_end < len(doc) and doc[gap_end].idx < spans[span_idx].begin: + gap_end += 1 + whitespace_found = False + for i in range(gap_start, gap_end): + if doc[i].text.isspace(): + doc_guesses[i] = True + logger.debug(f"[doc {doc_idx}] Mark sentence start at token {i}: '{doc[i].text}' idx={doc[i].idx} (gap whitespace)") + whitespace_found = True + if i+1 < gap_end and not doc[i+1].text.isspace(): + doc_guesses[i+1] = True + logger.debug(f"[doc {doc_idx}] Mark sentence start at token {i+1}: '{doc[i+1].text}' idx={doc[i+1].idx} (gap non-whitespace after whitespace)") + break + if not whitespace_found: + for i in range(gap_start, gap_end): + if not doc[i].text.isspace(): + doc_guesses[i] = True + logger.debug(f"[doc {doc_idx}] Mark sentence start at token {i}: '{doc[i].text}' idx={doc[i].idx} (gap non-whitespace)") + break + t = gap_end + continue + # 3. 
Split at last whitespace or previous token BEFORE exceeding max length + if max_sentence_length is not None: + sentence_len = 0 + last_ws_token = -1 + for k in range(sentence_start_t, t): + sentence_len += len(doc[k].text) + if doc[k].text.isspace(): + last_ws_token = k + current_token_len = len(token.text) + # If adding current token would exceed max length + if sentence_len + current_token_len > max_sentence_length: + # Find split point: last whitespace before limit, else previous token + split_token = last_ws_token if last_ws_token >= sentence_start_t else t-1 if t > sentence_start_t else t + # Prevent split_token from being the same as sentence_start_t (in case no whitespace and only one token) + if split_token == sentence_start_t and t > sentence_start_t: + split_token = t-1 + doc_guesses[split_token] = True + logger.debug(f"[doc {doc_idx}] Mark/Split due to max_sentence_length at token {split_token}: '{doc[split_token].text}' idx={doc[split_token].idx} (split before exceeding limit)") + sentence_start_t = split_token + continue + t += 1 + logger.debug(f"[doc {doc_idx}] Sentence start guesses: {[i for i, v in enumerate(doc_guesses) if v]}") + guesses.append(doc_guesses) + return guesses + +cpdef cpredict_split_gaps(docs, sentencizer_fun, max_sentence_length=None): + cdef list guesses + guesses = [] + call_id = getattr(cpredict_split_gaps, 'call_id', 0) + setattr(cpredict_split_gaps, 'call_id', call_id + 1) + for doc_idx, doc in enumerate(docs): if len(doc) == 0: guesses.append([]) continue doc_guesses = [False] * len(doc) sentence_spans = sentencizer_fun(doc.text) - s = 0 + num_spans = len(sentence_spans) t = 0 - while s < len(sentence_spans) and t < len(doc): - span = sentence_spans[s] + span_idx = 0 + sentence_start_idx = 0 + is_first_token_in_span = True + while t < len(doc): token = doc[t] - if len(token.text.strip()) == 0: + # Always check for gaps between spans before advancing span_idx + next_span_begin = sentence_spans[span_idx + 1].begin if span_idx < 
num_spans - 1 else -1 + # 1. Handle gaps between spans + if span_idx < num_spans - 1 and token.idx >= sentence_spans[span_idx].end and token.idx < next_span_begin: + gap_start = t + gap_end = t + while gap_end < len(doc) and doc[gap_end].idx < next_span_begin: + gap_end += 1 + logger.debug(f"[cpredict_split_gaps|call_id={call_id}] [doc {doc_idx}] GAP DETECTED: tokens {gap_start}-{gap_end-1} (idx {doc[gap_start].idx}-{doc[gap_end-1].idx}) between spans {sentence_spans[span_idx].end}-{next_span_begin}") + # Mark first whitespace token in gap, else first token + whitespace_idx = -1 + for i in range(gap_start, gap_end): + if doc[i].text.isspace(): + whitespace_idx = i + break + if whitespace_idx != -1: + doc_guesses[whitespace_idx] = True + logger.debug(f"[cpredict_split_gaps|call_id={call_id}] [doc {doc_idx}] Token {whitespace_idx} '{doc[whitespace_idx].text}' marked as sentence start (whitespace in gap between spans)") + else: + doc_guesses[gap_start] = True + logger.debug(f"[cpredict_split_gaps|call_id={call_id}] [doc {doc_idx}] Token {gap_start} '{doc[gap_start].text}' marked as sentence start (first token in gap between spans)") + t = gap_end + continue + # 2. Advance span_idx if needed + while span_idx < num_spans and token.idx >= sentence_spans[span_idx].end: + span_idx += 1 + if span_idx >= num_spans: + # After all spans, only mark whitespace tokens as sentence start + if token.text.isspace(): + doc_guesses[t] = True + logger.debug(f"[cpredict_split_gaps|call_id={call_id}] [doc {doc_idx}] Token {t} '{token.text}' marked as sentence start (whitespace after all spans)") t += 1 continue - if token.idx <= span.begin < token.idx + len(token): - doc_guesses[t] = True + span = sentence_spans[span_idx] + # 3. If before the span, skip + if token.idx < span.begin: t += 1 - s += 1 - elif token.idx + len(token) <= span.begin: + continue + # 4. 
If in the span + if token.idx < span.end: + # Mark sentence start if token overlaps with span.begin + if token.idx == span.begin: + doc_guesses[t] = True + sentence_start_idx = token.idx + logger.debug(f"[cpredict_split_gaps|call_id={call_id}] [doc {doc_idx}] Token {t} '{token.text}' marked as sentence start (span begin)") + # If sentence length exceeds max_sentence_length, mark as sentence start + elif max_sentence_length is not None and (token.idx - sentence_start_idx) + len(token.text) > max_sentence_length: + doc_guesses[t] = True + sentence_start_idx = token.idx + logger.debug(f"[cpredict_split_gaps|call_id={call_id}] [doc {doc_idx}] Token {t} '{token.text}' marked as sentence start (max length split in span {span_idx})") + # If this is the last token in the span, mark next token as sentence start (if exists) + if t + 1 < len(doc) and doc[t + 1].idx >= span.end: + # Mark whitespace preferred, else next token + if doc[t + 1].text.isspace(): + doc_guesses[t + 1] = True + logger.debug(f"[cpredict_split_gaps|call_id={call_id}] [doc {doc_idx}] Token {t+1} '{doc[t+1].text}' marked as sentence start (span end whitespace)") + else: + doc_guesses[t + 1] = True + logger.debug(f"[cpredict_split_gaps|call_id={call_id}] [doc {doc_idx}] Token {t+1} '{doc[t+1].text}' marked as sentence start (span end next token)") t += 1 - else: - s += 1 + continue + t += 1 + logger.debug(f'[cpredict_split_gaps|call_id={call_id}] Token/tag mapping: ' + str([(d, l) for d, l in zip(list(doc), doc_guesses)])) guesses.append(doc_guesses) return guesses cpdef cset_annotations(docs, batch_tag_ids, tensors=None): + if type(docs) !=list: docs = [docs] - for i, doc in enumerate(docs): + for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] for j, tag_id in enumerate(doc_tag_ids): # Don't clobber existing sentence boundaries diff --git a/PyRuSH/__init__.py b/PyRuSH/__init__.py index fe7d966..bc28b14 100644 --- a/PyRuSH/__init__.py +++ b/PyRuSH/__init__.py @@ -30,7 +30,7 @@ from 
.PyRuSHSentencizer import PyRuSHSentencizer from .RuSH import RuSH, BEGIN, END -from .version import __version__ +__version__ = '1.0.12' diff --git a/PyRuSH/version.py b/PyRuSH/version.py deleted file mode 100644 index 590a0a9..0000000 --- a/PyRuSH/version.py +++ /dev/null @@ -1,20 +0,0 @@ -__version__ = '1.0.8' -if __name__ == '__main__': - print(__version__) -# ****************************************************************************** -# MIT License -# -# Copyright (c) 2020 Jianlin Shi -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation -# files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, -# modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-# ****************************************************************************** diff --git a/README.rst b/README.rst index 1464a03..0bb2dc0 100644 --- a/README.rst +++ b/README.rst @@ -52,3 +52,16 @@ Start from version 1.0.3, PyRuSH adds Spacy compatible Sentencizer component: Py A Colab Notebook Demo --------------------------- Feel free to try this runnable `Colab notebook Demo `_ + +Revision History +---------------- + +**1.0.11 (2025-09-02)** + +- Improved sentence splitting logic: Sentences are now split at the last token before exceeding the max length, ensuring no chunk exceeds the specified limit. +- Edge case handling: Trailing whitespaces (caused by spacy sentence labeling mechanism) can be optionally split into a separate sentence (merge_gaps=False) to avoid unnecessarily long sentences. + +**1.0.9 (2024-10-27)** + +- Initial release with spaCy 3.x compatibility and core RuSH logic. +- Added Spacy-compatible PyRuSHSentencizer component. diff --git a/conf/rush_rules.tsv b/conf/rush_rules.tsv index 6070aa0..0460155 100644 --- a/conf/rush_rules.tsv +++ b/conf/rush_rules.tsv @@ -20,7 +20,7 @@ #stbegin is the marker for sentence begin, the span of sentence will start at the begin of the captured group #stbegin has two scores 0, 1: 0 for true sentence begin clues, 1 for false sentence begin clues which will overwrite 0-scored rules when they are overlapping.
-#stend is the marker for sentence end, the span of sentence will end at the end of the captured group +#stend is the marker for sentence end, the span of sentence will end at the end of the captured group #stend also has two scores 2, 3: 2 for true sentence end clues, 3 for false sentence end clues which will overwrite 2-scored rules when they are overlapping # \b the begin of an input diff --git a/dev-requirements.txt b/dev-requirements.txt index 5006211..1ffb0d2 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,4 +1,11 @@ -spacy>=3.0.0 +Cython +setuptools +spacy<3.8; python_version < "3.12" +spacy>=3.8; python_version >= "3.12" PyFastNER>=1.0.8 -pytest quicksectx>=0.3.5 +pytest +numpy +wheel +loguru +medspacy \ No newline at end of file diff --git a/notebooks/debug.ipynb b/notebooks/debug.ipynb new file mode 100644 index 0000000..13f896f --- /dev/null +++ b/notebooks/debug.ipynb @@ -0,0 +1,780 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a297d69d", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from PyRuSH import PyRuSHSentencizer\n", + "from spacy.lang.en import English\n", + "from PyRuSH import RuSH\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c9128bb3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + } + ], + "source": [ + "rush = RuSH(os.path.join('../conf/rush_rules.tsv'), enable_logger=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "81ef94a6", + "metadata": {}, + "outputs": [], + "source": [ + "input_str = ''' \n", + "\n", + " \n", + " Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina.\n", + "We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram.
This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease.\n", + "She may continue in the future to have angina and she will have nitroglycerin available for that if needed.\n", + "Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor.\n", + "So her discharge meds are as follows:\n", + "1. Coreg 6.25 mg b.i.d.\n", + "2. Simvastatin 40 mg nightly.\n", + "3. Lisinopril 5 mg b.i.d.\n", + "4. Protonix 40 mg a.m.\n", + "5. Aspirin 160 mg a day.\n", + "6. Lasix 20 mg b.i.d.\n", + "7. Spiriva puff daily.\n", + "8. Albuterol p.r.n. q.i.d.\n", + "9. Advair 500/50 puff b.i.d.\n", + "10. Xopenex q.i.d. and p.r.n.\n", + "I will see her in a month to six weeks. She is to follow up with Dr. X before that.\n", + " \n", + "\n", + "\n", + " Ezoic - MTSam Sample Bottom Matched Content - native_bottom \n", + "\n", + "\n", + "\n", + "\n", + " End Ezoic - MTSam Sample Bottom Matched Content - native_bottom\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2f8f59b4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2025-08-31 20:58:16,934 - PyRuSH.RuSH - DEBUG - stbegin\n", + "2025-08-31 20:58:16,934 - PyRuSH.RuSH - DEBUG - \t140-144:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. \t[Rule 959:\t.\\w+(She \tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,934 - PyRuSH.RuSH - DEBUG - \t197-200:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. \t[Rule 960:\t.\\w+(We \tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,934 - PyRuSH.RuSH - DEBUG - \t249-252:1.0\t Ms. 
ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. \t[Rule 571:\t.\\s+(The\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,934 - PyRuSH.RuSH - DEBUG - \t308-311:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. \t[Rule 960:\t.\\w+(We \tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,934 - PyRuSH.RuSH - DEBUG - \t338-343:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. \t[Rule 970:\t.\\w+(This \tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,941 - PyRuSH.RuSH - DEBUG - \t451-455:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. \t[Rule 959:\t.\\w+(She \tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,941 - PyRuSH.RuSH - DEBUG - \t559-563:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. 
We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. \t[Rule 958:\t.\\w+(Her \tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,941 - PyRuSH.RuSH - DEBUG - \t671-676:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. \t[Rule 970:\t.\\w+(This \tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,941 - PyRuSH.RuSH - DEBUG - \t767-771:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. \t[Rule 959:\t.\\w+(She \tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,941 - PyRuSH.RuSH - DEBUG - \t800-801:1.0\t Ms. 
ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. \t[Rule 210:\t\\n+(\\C)\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,941 - PyRuSH.RuSH - DEBUG - \t838-843:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: <1>\t[Rule 790:\t\\a\\n+(\\d.\\s+\\C\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,934 - PyRuSH.RuSH - DEBUG - \t140-144:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. 
\t[Rule 959:\t.\\w+(She \tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,934 - PyRuSH.RuSH - DEBUG - \t197-200:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. \t[Rule 960:\t.\\w+(We \tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,934 - PyRuSH.RuSH - DEBUG - \t249-252:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. \t[Rule 571:\t.\\s+(The\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,934 - PyRuSH.RuSH - DEBUG - \t308-311:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. \t[Rule 960:\t.\\w+(We \tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,934 - PyRuSH.RuSH - DEBUG - \t338-343:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. \t[Rule 970:\t.\\w+(This \tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,941 - PyRuSH.RuSH - DEBUG - \t451-455:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. 
\t[Rule 959:\t.\\w+(She \tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,941 - PyRuSH.RuSH - DEBUG - \t559-563:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. \t[Rule 958:\t.\\w+(Her \tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,941 - PyRuSH.RuSH - DEBUG - \t671-676:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. \t[Rule 970:\t.\\w+(This \tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,941 - PyRuSH.RuSH - DEBUG - \t767-771:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. 
She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. \t[Rule 959:\t.\\w+(She \tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,941 - PyRuSH.RuSH - DEBUG - \t800-801:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. \t[Rule 210:\t\\n+(\\C)\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,941 - PyRuSH.RuSH - DEBUG - \t838-843:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. 
This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: <1>\t[Rule 790:\t\\a\\n+(\\d.\\s+\\C\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,946 - PyRuSH.RuSH - DEBUG - \t863-868:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. <2>\t[Rule 790:\t\\a\\n+(\\d.\\s+\\C\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,946 - PyRuSH.RuSH - DEBUG - \t894-899:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. 
She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. <3>\t[Rule 790:\t\\a\\n+(\\d.\\s+\\C\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,948 - PyRuSH.RuSH - DEBUG - \t921-926:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. <4>\t[Rule 790:\t\\a\\n+(\\d.\\s+\\C\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,950 - PyRuSH.RuSH - DEBUG - \t945-950:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. 
as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. <5>\t[Rule 790:\t\\a\\n+(\\d.\\s+\\C\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,950 - PyRuSH.RuSH - DEBUG - \t971-976:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. <6>\t[Rule 790:\t\\a\\n+(\\d.\\s+\\C\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,953 - PyRuSH.RuSH - DEBUG - \t994-999:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. 
Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. <7>\t[Rule 790:\t\\a\\n+(\\d.\\s+\\C\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,954 - PyRuSH.RuSH - DEBUG - \t1018-1023:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. <8>\t[Rule 790:\t\\a\\n+(\\d.\\s+\\C\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,955 - PyRuSH.RuSH - DEBUG - \t1046-1051:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. 
This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n. q.i.d. <9>\t[Rule 790:\t\\a\\n+(\\d.\\s+\\C\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,955 - PyRuSH.RuSH - DEBUG - \t1076-1078:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n. q.i.d. 9. Advair 500/50 puff b.i.d. <1>\t[Rule 218:\t\\n(\\d+).\\s+\\C\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,957 - PyRuSH.RuSH - DEBUG - \t1081-1082:1.0\t Ms. 
ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n. q.i.d. 9. Advair 500/50 puff b.i.d. 10. \t[Rule 62:\t\\d.\\s+(\\C)\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,958 - PyRuSH.RuSH - DEBUG - \t1107-1109:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. 
Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n. q.i.d. 9. Advair 500/50 puff b.i.d. 10. Xopenex q.i.d. and p.r.n. \t[Rule 965:\t.\\w+(I \tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,958 - PyRuSH.RuSH - DEBUG - \t1148-1152:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n. q.i.d. 9. Advair 500/50 puff b.i.d. 10. Xopenex q.i.d. and p.r.n. I will see her in a month to six weeks. \t[Rule 959:\t.\\w+(She \tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,959 - PyRuSH.RuSH - DEBUG - \t1204-1205:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. 
She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n. q.i.d. 9. Advair 500/50 puff b.i.d. 10. Xopenex q.i.d. and p.r.n. I will see her in a month to six weeks. She is to follow up with Dr. X before that. \t[Rule 63:\t\\n\\n\\s+(\\C)\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,960 - PyRuSH.RuSH - DEBUG - \t1210-1211:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n. q.i.d. 9. Advair 500/50 puff b.i.d. 10. Xopenex q.i.d. and p.r.n. I will see her in a month to six weeks. She is to follow up with Dr. 
X before that. Ezoic <->\t[Rule 812:\t\\c\\s+(-)\\s+\\C\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,961 - PyRuSH.RuSH - DEBUG - \t1270-1271:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n. q.i.d. 9. Advair 500/50 puff b.i.d. 10. Xopenex q.i.d. and p.r.n. I will see her in a month to six weeks. She is to follow up with Dr. X before that. Ezoic - MTSam Sample Bottom Matched Content - native_bottom \t[Rule 63:\t\\n\\n\\s+(\\C)\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,961 - PyRuSH.RuSH - DEBUG - \t1280-1281:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. 
She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n. q.i.d. 9. Advair 500/50 puff b.i.d. 10. Xopenex q.i.d. and p.r.n. I will see her in a month to six weeks. She is to follow up with Dr. X before that. Ezoic - MTSam Sample Bottom Matched Content - native_bottom End Ezoic <->\t[Rule 812:\t\\c\\s+(-)\\s+\\C\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:58:16,962 - PyRuSH.RuSH - DEBUG - stend\n", + "2025-08-31 20:58:16,964 - PyRuSH.RuSH - DEBUG - \t137-138:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency<.>\t[Rule 410:\t\\c\\c(.)\\s+\\C\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,964 - PyRuSH.RuSH - DEBUG - \t195-196:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina<.>\t[Rule 423:\t\\c(.)\\n\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,965 - PyRuSH.RuSH - DEBUG - \t246-247:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG<.>\t[Rule 419:\t\\C\\C\\C(.)\\s+\\C\\c\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,966 - PyRuSH.RuSH - DEBUG - \t305-306:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. 
She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001<.>\t[Rule 353:\t\\d(.)\\s+\\C\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,966 - PyRuSH.RuSH - DEBUG - \t335-336:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram<.>\t[Rule 410:\t\\c\\c(.)\\s+\\C\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,968 - PyRuSH.RuSH - DEBUG - \t449-450:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease<.>\t[Rule 423:\t\\c(.)\\n\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,969 - PyRuSH.RuSH - DEBUG - \t557-558:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed<.>\t[Rule 423:\t\\c(.)\\n\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,970 - PyRuSH.RuSH - DEBUG - \t668-669:1.0\t Ms. 
ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d<.>\t[Rule 940:\t\\c.\\c(.)\\w+\\C\\c\\c\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,970 - PyRuSH.RuSH - DEBUG - \t713-714:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d<.>\t[Rule 317:\t\\a(.) +\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,971 - PyRuSH.RuSH - DEBUG - \t764-765:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. 
She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case<.>\t[Rule 410:\t\\c\\c(.)\\s+\\C\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,971 - PyRuSH.RuSH - DEBUG - \t798-799:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor<.>\t[Rule 423:\t\\c(.)\\n\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,972 - PyRuSH.RuSH - DEBUG - \t836-837:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. 
This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows<:>\t[Rule 413:\t\\c(:)\\n\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,973 - PyRuSH.RuSH - DEBUG - \t861-862:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d<.>\t[Rule 423:\t\\c(.)\\n\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,974 - PyRuSH.RuSH - DEBUG - \t892-893:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. 
So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly<.>\t[Rule 423:\t\\c(.)\\n\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,975 - PyRuSH.RuSH - DEBUG - \t919-920:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d<.>\t[Rule 423:\t\\c(.)\\n\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,976 - PyRuSH.RuSH - DEBUG - \t943-944:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. 
So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m<.>\t[Rule 423:\t\\c(.)\\n\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,976 - PyRuSH.RuSH - DEBUG - \t969-970:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day<.>\t[Rule 423:\t\\c(.)\\n\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,977 - PyRuSH.RuSH - DEBUG - \t992-993:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. 
as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d<.>\t[Rule 423:\t\\c(.)\\n\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,978 - PyRuSH.RuSH - DEBUG - \t1016-1017:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily<.>\t[Rule 423:\t\\c(.)\\n\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,979 - PyRuSH.RuSH - DEBUG - \t1037-1038:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. 
She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n<.>\t[Rule 317:\t\\a(.) +\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,980 - PyRuSH.RuSH - DEBUG - \t1044-1045:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n. q.i.d<.>\t[Rule 423:\t\\c(.)\\n\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,981 - PyRuSH.RuSH - DEBUG - \t1074-1075:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. 
We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n. q.i.d. 9. Advair 500/50 puff b.i.d<.>\t[Rule 423:\t\\c(.)\\n\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,982 - PyRuSH.RuSH - DEBUG - \t1094-1095:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n. q.i.d. 9. 
Advair 500/50 puff b.i.d. 10. Xopenex q.i.d<.>\t[Rule 317:\t\\a(.) +\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,983 - PyRuSH.RuSH - DEBUG - \t1105-1106:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n. q.i.d. 9. Advair 500/50 puff b.i.d. 10. Xopenex q.i.d. and p.r.n<.>\t[Rule 423:\t\\c(.)\\n\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,984 - PyRuSH.RuSH - DEBUG - \t1145-1146:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. 
This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n. q.i.d. 9. Advair 500/50 puff b.i.d. 10. Xopenex q.i.d. and p.r.n. I will see her in a month to six weeks<.>\t[Rule 410:\t\\c\\c(.)\\s+\\C\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,985 - PyRuSH.RuSH - DEBUG - \t1190-1191:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n. q.i.d. 9. Advair 500/50 puff b.i.d. 10. Xopenex q.i.d. and p.r.n. I will see her in a month to six weeks. She is to follow up with Dr. X before that<.>\t[Rule 423:\t\\c(.)\\n\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,986 - PyRuSH.RuSH - DEBUG - \t1192-1205:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. 
She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n. q.i.d. 9. Advair 500/50 puff b.i.d. 10. Xopenex q.i.d. and p.r.n. I will see her in a month to six weeks. She is to follow up with Dr. X before that. < >\t[Rule 523:\t\\s+\\n\\n+\\s+\\C\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,987 - PyRuSH.RuSH - DEBUG - \t1208-1213:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. 
Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n. q.i.d. 9. Advair 500/50 puff b.i.d. 10. Xopenex q.i.d. and p.r.n. I will see her in a month to six weeks. She is to follow up with Dr. X before that. Ezoi\t[Rule 813:\t\\c\\s+-\\s+\\C\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,988 - PyRuSH.RuSH - DEBUG - \t1263-1271:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n. q.i.d. 9. Advair 500/50 puff b.i.d. 10. Xopenex q.i.d. and p.r.n. I will see her in a month to six weeks. She is to follow up with Dr. X before that. Ezoic - MTSam Sample Bottom Matched Content - native_bottom< >\t[Rule 523:\t\\s+\\n\\n+\\s+\\C\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,988 - PyRuSH.RuSH - DEBUG - \t1278-1283:1.0\t Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina. 
We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease. She may continue in the future to have angina and she will have nitroglycerin available for that if needed. Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d. This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case. She also is on an ACE inhibitor. So her discharge meds are as follows: 1. Coreg 6.25 mg b.i.d. 2. Simvastatin 40 mg nightly. 3. Lisinopril 5 mg b.i.d. 4. Protonix 40 mg a.m. 5. Aspirin 160 mg a day. 6. Lasix 20 mg b.i.d. 7. Spiriva puff daily. 8. Albuterol p.r.n. q.i.d. 9. Advair 500/50 puff b.i.d. 10. Xopenex q.i.d. and p.r.n. I will see her in a month to six weeks. She is to follow up with Dr. X before that. Ezoic - MTSam Sample Bottom Matched Content - native_bottom End Ezoi\t[Rule 813:\t\\c\\s+-\\s+\\C\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:58:16,989 - PyRuSH.RuSH - DEBUG - Sentence(27-138):\t>Ms. 
ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency.<\n", + "2025-08-31 20:58:16,990 - PyRuSH.RuSH - DEBUG - Sentence(140-196):\t>She has chronic lung disease with bronchospastic angina.<\n", + "2025-08-31 20:58:16,991 - PyRuSH.RuSH - DEBUG - Sentence(197-247):\t>We discovered new T-wave abnormalities on her EKG.<\n", + "2025-08-31 20:58:16,992 - PyRuSH.RuSH - DEBUG - Sentence(249-306):\t>There was of course a four-vessel bypass surgery in 2001.<\n", + "2025-08-31 20:58:16,993 - PyRuSH.RuSH - DEBUG - Sentence(308-336):\t>We did a coronary angiogram.<\n", + "2025-08-31 20:58:16,993 - PyRuSH.RuSH - DEBUG - Sentence(338-450):\t>This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease.<\n", + "2025-08-31 20:58:16,993 - PyRuSH.RuSH - DEBUG - Sentence(451-558):\t>She may continue in the future to have angina and she will have nitroglycerin available for that if needed.<\n", + "2025-08-31 20:58:16,996 - PyRuSH.RuSH - DEBUG - Sentence(559-669):\t>Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d.<\n", + "2025-08-31 20:58:16,997 - PyRuSH.RuSH - DEBUG - Sentence(671-765):\t>This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case.<\n", + "2025-08-31 20:58:16,998 - PyRuSH.RuSH - DEBUG - Sentence(767-799):\t>She also is on an ACE inhibitor.<\n", + "2025-08-31 20:58:16,999 - PyRuSH.RuSH - DEBUG - Sentence(800-837):\t>So her discharge meds are as follows:<\n", + "2025-08-31 20:58:16,999 - PyRuSH.RuSH - DEBUG - Sentence(838-862):\t>1. Coreg 6.25 mg b.i.d.<\n", + "2025-08-31 20:58:17,001 - PyRuSH.RuSH - DEBUG - Sentence(863-893):\t>2. Simvastatin 40 mg nightly.<\n", + "2025-08-31 20:58:17,002 - PyRuSH.RuSH - DEBUG - Sentence(894-920):\t>3. Lisinopril 5 mg b.i.d.<\n", + "2025-08-31 20:58:17,003 - PyRuSH.RuSH - DEBUG - Sentence(921-944):\t>4. 
Protonix 40 mg a.m.<\n", + "2025-08-31 20:58:17,004 - PyRuSH.RuSH - DEBUG - Sentence(945-970):\t>5. Aspirin 160 mg a day.<\n", + "2025-08-31 20:58:17,005 - PyRuSH.RuSH - DEBUG - Sentence(971-993):\t>6. Lasix 20 mg b.i.d.<\n", + "2025-08-31 20:58:17,006 - PyRuSH.RuSH - DEBUG - Sentence(994-1017):\t>7. Spiriva puff daily.<\n", + "2025-08-31 20:58:17,009 - PyRuSH.RuSH - DEBUG - Sentence(1018-1045):\t>8. Albuterol p.r.n. q.i.d.<\n", + "2025-08-31 20:58:17,010 - PyRuSH.RuSH - DEBUG - Sentence(1046-1075):\t>9. Advair 500/50 puff b.i.d.<\n", + "2025-08-31 20:58:17,011 - PyRuSH.RuSH - DEBUG - Sentence(1076-1106):\t>10. Xopenex q.i.d. and p.r.n.<\n", + "2025-08-31 20:58:17,013 - PyRuSH.RuSH - DEBUG - Sentence(1107-1146):\t>I will see her in a month to six weeks.<\n", + "2025-08-31 20:58:17,014 - PyRuSH.RuSH - DEBUG - Sentence(1148-1191):\t>She is to follow up with Dr. X before that.<\n", + "2025-08-31 20:58:17,015 - PyRuSH.RuSH - DEBUG - Sentence(1204-1209):\t>Ezoic<\n", + "2025-08-31 20:58:17,016 - PyRuSH.RuSH - DEBUG - Sentence(1210-1263):\t>- MTSam Sample Bottom Matched Content - native_bottom<\n", + "2025-08-31 20:58:17,017 - PyRuSH.RuSH - DEBUG - Sentence(1270-1279):\t>End Ezoic<\n", + "2025-08-31 20:58:17,018 - PyRuSH.RuSH - DEBUG - Sentence(1280-1333):\t>- MTSam Sample Bottom Matched Content - native_bottom<\n" + ] + } + ], + "source": [ + "sents=rush.segToSentenceSpans(input_str)\n", + "# 2025-08-24 23:45:48,415 - PyRuSH.RuSH - DEBUG - Sentence(19-130):\t>Ms. 
ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency.<\n", + "# 2025-08-24 23:45:48,416 - PyRuSH.RuSH - DEBUG - Sentence(132-188):\t>She has chronic lung disease with bronchospastic angina.<\n", + "# 2025-08-24 23:45:48,416 - PyRuSH.RuSH - DEBUG - Sentence(189-239):\t>We discovered new T-wave abnormalities on her EKG.<\n", + "# 2025-08-24 23:45:48,420 - PyRuSH.RuSH - DEBUG - Sentence(241-298):\t>There was of course a four-vessel bypass surgery in 2001.<\n", + "# 2025-08-24 23:45:48,420 - PyRuSH.RuSH - DEBUG - Sentence(300-328):\t>We did a coronary angiogram.<\n", + "# 2025-08-24 23:45:48,421 - PyRuSH.RuSH - DEBUG - Sentence(332-426):\t>This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case.<\n", + "# 2025-08-24 23:45:48,421 - PyRuSH.RuSH - DEBUG - Sentence(428-460):\t>She also is on an ACE inhibitor.<\n", + "# 2025-08-24 23:45:48,422 - PyRuSH.RuSH - DEBUG - Sentence(461-498):\t>So her discharge meds are as follows:<\n", + "# 2025-08-24 23:45:48,422 - PyRuSH.RuSH - DEBUG - Sentence(499-523):\t>1. Coreg 6.25 mg b.i.d.<" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c0d5de6d", + "metadata": {}, + "outputs": [], + "source": [ + "txt1='''10. Xopenex q.i.d. and p.r.n.\n", + " + She is to follow up with Dr. X before that.\n", + " \n", + "\n", + "\n", + " Ezoic - MTSam Sample Bottom Matched Content - native_bottom '''" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "29f0910e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2025-08-31 20:55:14,722 - PyRuSH.RuSH - DEBUG - stbegin\n", + "2025-08-31 20:55:14,723 - PyRuSH.RuSH - DEBUG - \t0-1:1.0\t<1>\t[Rule 46:\t\\b(\\d\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:55:14,723 - PyRuSH.RuSH - DEBUG - \t0-1:1.0\t<1>\t[Rule 46:\t\\b(\\d\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:55:14,723 - PyRuSH.RuSH - DEBUG - \t5-6:1.0\t10. 
\t[Rule 57:\t\\d.\\s+(\\C)\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:55:14,725 - PyRuSH.RuSH - DEBUG - \t90-91:1.0\t10. Xopenex q.i.d. and p.r.n. + She is to follow up with Dr. X before that. \t[Rule 58:\t\\n\\n\\s+(\\C)\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:55:14,725 - PyRuSH.RuSH - DEBUG - \t96-97:1.0\t10. Xopenex q.i.d. and p.r.n. + She is to follow up with Dr. X before that. Ezoic <->\t[Rule 807:\t\\c\\s+(-)\\s+\\C\tstbegin\t0.0\tACTUAL]\n", + "2025-08-31 20:55:14,727 - PyRuSH.RuSH - DEBUG - stend\n", + "2025-08-31 20:55:14,727 - PyRuSH.RuSH - DEBUG - \t2-3:1.0\t10<.>\t[Rule 348:\t\\d(.)\\s+\\C\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:55:14,729 - PyRuSH.RuSH - DEBUG - \t18-19:1.0\t10. Xopenex q.i.d<.>\t[Rule 312:\t\\a(.) +\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:55:14,730 - PyRuSH.RuSH - DEBUG - \t29-30:1.0\t10. Xopenex q.i.d. and p.r.n<.>\t[Rule 418:\t\\c(.)\\n\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:55:14,731 - PyRuSH.RuSH - DEBUG - \t76-77:1.0\t10. Xopenex q.i.d. and p.r.n. + She is to follow up with Dr. X before that<.>\t[Rule 418:\t\\c(.)\\n\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:55:14,731 - PyRuSH.RuSH - DEBUG - \t78-91:1.0\t10. Xopenex q.i.d. and p.r.n. + She is to follow up with Dr. X before that. < >\t[Rule 518:\t\\s+\\n\\n+\\s+\\C\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:55:14,732 - PyRuSH.RuSH - DEBUG - \t94-99:1.0\t10. Xopenex q.i.d. and p.r.n. + She is to follow up with Dr. X before that. Ezoi\t[Rule 808:\t\\c\\s+-\\s+\\C\tstend\t2.0\tACTUAL]\n", + "2025-08-31 20:55:14,733 - PyRuSH.RuSH - DEBUG - Sentence(0-3):\t>10.<\n", + "2025-08-31 20:55:14,734 - PyRuSH.RuSH - DEBUG - Sentence(5-95):\t>Xopenex q.i.d. and p.r.n. + She is to follow up with Dr. X before that. 
Ezoic<\n", + "2025-08-31 20:55:14,735 - PyRuSH.RuSH - DEBUG - Sentence(96-149):\t>- MTSam Sample Bottom Matched Content - native_bottom<\n" + ] + } + ], + "source": [ + "sents=rush.segToSentenceSpans(txt1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "52bdf6d1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m2025-08-31 20:58:32.779\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 1 'Ms.' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.779\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 25 ' ' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.779\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 25-25 (idx 139-139) between spans 138-140\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.779\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 25 ' ' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.779\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 26 'She' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.779\u001b[0m | 
\u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 35 '\n", + "' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.779\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 35-35 (idx 196-196) between spans 196-197\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.779\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 35 '\n", + "' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.779\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 36 'We' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.779\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 47 ' ' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.794\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 47-47 (idx 248-248) between spans 247-249\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.795\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | 
\u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 47 ' ' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.796\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 48 'There' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.796\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 61 ' ' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.797\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 61-61 (idx 307-307) between spans 306-308\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.797\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 61 ' ' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.798\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 62 'We' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.798\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - 
\u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 68 ' ' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.799\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 68-68 (idx 337-337) between spans 336-338\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.799\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 68 ' ' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.799\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 69 'This' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.800\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 88 '\n", + "' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.800\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 88-88 (idx 450-450) between spans 450-451\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.801\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 88 '\n", + "' marked as sentence 
start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.801\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 89 'She' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.801\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 109 '\n", + "' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.802\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 109-109 (idx 558-558) between spans 558-559\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.803\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 109 '\n", + "' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.804\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 110 'Her' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.805\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 132 ' ' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.807\u001b[0m | 
\u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 132-132 (idx 670-670) between spans 669-671\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.808\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 132 ' ' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.808\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 133 'This' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.809\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 152 ' ' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.810\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 152-152 (idx 766-766) between spans 765-767\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.810\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 152 ' ' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.811\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | 
\u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 153 'She' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.812\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 161 '\n", + "' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.813\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 161-161 (idx 799-799) between spans 799-800\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.813\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 161 '\n", + "' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.814\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 162 'So' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.815\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 170 '\n", + "' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.815\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m 
- \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 170-170 (idx 837-837) between spans 837-838\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.816\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 170 '\n", + "' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.818\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 171 '1' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.821\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 179 '\n", + "' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.822\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 179-179 (idx 862-862) between spans 862-863\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.823\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 179 '\n", + "' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.824\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 180 
'2' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.825\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 188 '\n", + "' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.825\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 188-188 (idx 893-893) between spans 893-894\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.825\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 188 '\n", + "' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.825\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 189 '3' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.829\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 197 '\n", + "' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.830\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 197-197 (idx 920-920) between spans 920-921\u001b[0m\n", + "\u001b[32m2025-08-31 
20:58:32.831\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 197 '\n", + "' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.832\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 198 '4' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.833\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 205 '\n", + "' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.834\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 205-205 (idx 944-944) between spans 944-945\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.835\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 205 '\n", + "' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.835\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 206 '5' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.836\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | 
\u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 215 '\n", + "' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.837\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 215-215 (idx 970-970) between spans 970-971\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.837\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 215 '\n", + "' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.838\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 216 '6' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.839\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 224 '\n", + "' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.840\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 224-224 (idx 993-993) between spans 993-994\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.840\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | 
\u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 224 '\n", + "' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.840\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 225 '7' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.843\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 232 '\n", + "' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.843\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 232-232 (idx 1017-1017) between spans 1017-1018\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.845\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 232 '\n", + "' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.846\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 233 '8' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.847\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | 
\u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 241 '\n", + "' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.848\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 241-241 (idx 1045-1045) between spans 1045-1046\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.849\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 241 '\n", + "' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.850\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 242 '9' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.851\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 250 '\n", + "' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.852\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 250-250 (idx 1075-1075) between spans 1075-1076\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.853\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | 
\u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 250 '\n", + "' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.854\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 251 '10' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.855\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 260 '\n", + "' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.858\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 260-260 (idx 1106-1106) between spans 1106-1107\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.858\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 260 '\n", + "' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.858\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 261 'I' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.861\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | 
\u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 272 ' ' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.862\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 272-272 (idx 1147-1147) between spans 1146-1148\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.863\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 272 ' ' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.864\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 273 'She' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.865\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 284 '\n", + " \n", + "\n", + "\n", + " ' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.866\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 284-284 (idx 1191-1191) between spans 1191-1204\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.867\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | 
\u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 284 '\n", + " \n", + "\n", + "\n", + " ' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.868\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 285 'Ezoic' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.869\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 286 '-' marked as sentence start (span end next token)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.870\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 286 '-' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.870\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 294 '\n", + "\n", + "\n", + "\n", + "\n", + " ' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.870\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] GAP DETECTED: tokens 294-294 (idx 1264-1264) between spans 1263-1270\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.870\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | 
\u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 294 '\n", + "\n", + "\n", + "\n", + "\n", + " ' marked as sentence start (whitespace in gap between spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.870\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 295 'End' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.870\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 297 '-' marked as sentence start (span end next token)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.870\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 297 '-' marked as sentence start (span begin)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.870\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 305 '\n", + "' marked as sentence start (span end whitespace)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.879\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 305 '\n", + "' marked as sentence start (whitespace after all spans)\u001b[0m\n", + "\u001b[32m2025-08-31 20:58:32.881\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | 
\u001b[36mPyRuSH.PyRuSHSentencizer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m100\u001b[0m - \u001b[34m\u001b[1m[cpredict_split_gaps|call_id=0] Token/tag mapping: [( \n", + "\n", + " \n", + " , False), (Ms., True), (ABCD, False), (is, False), (a, False), (69, False), (-, False), (year, False), (-, False), (old, False), (lady, False), (,, False), (who, False), (was, False), (admitted, False), (to, False), (the, False), (hospital, False), (with, False), (chest, False), (pain, False), (and, False), (respiratory, False), (insufficiency, False), (., False), ( , True), (She, True), (has, False), (chronic, False), (lung, False), (disease, False), (with, False), (bronchospastic, False), (angina, False), (., False), (\n", + ", True), (We, True), (discovered, False), (new, False), (T, False), (-, False), (wave, False), (abnormalities, False), (on, False), (her, False), (EKG, False), (., False), ( , True), (There, True), (was, False), (of, False), (course, False), (a, False), (four, False), (-, False), (vessel, False), (bypass, False), (surgery, False), (in, False), (2001, False), (., False), ( , True), (We, True), (did, False), (a, False), (coronary, False), (angiogram, False), (., False), ( , True), (This, True), (demonstrated, False), (patent, False), (vein, False), (grafts, False), (and, False), (patent, False), (internal, False), (mammary, False), (vessel, False), (and, False), (so, False), (there, False), (was, False), (no, False), (obvious, False), (new, False), (disease, False), (., False), (\n", + ", True), (She, True), (may, False), (continue, False), (in, False), (the, False), (future, False), (to, False), (have, False), (angina, False), (and, False), (she, False), (will, False), (have, False), (nitroglycerin, False), (available, False), (for, False), (that, False), (if, False), (needed, False), (., False), (\n", + ", True), (Her, True), (blood, False), (pressure, False), (has, False), (been, False), (elevated, False), (and, False), (so, False), (instead, 
False), (of, False), (metoprolol, False), (,, False), (we, False), (have, False), (started, False), (her, False), (on, False), (Coreg, False), (6.25, False), (mg, False), (b.i.d, False), (., False), ( , True), (This, True), (should, False), (be, False), (increased, False), (up, False), (to, False), (25, False), (mg, False), (b.i.d, False), (., False), (as, False), (preferred, False), (antihypertensive, False), (in, False), (this, False), (lady, False), ('s, False), (case, False), (., False), ( , True), (She, True), (also, False), (is, False), (on, False), (an, False), (ACE, False), (inhibitor, False), (., False), (\n", + ", True), (So, True), (her, False), (discharge, False), (meds, False), (are, False), (as, False), (follows, False), (:, False), (\n", + ", True), (1, True), (., False), ( , False), (Coreg, False), (6.25, False), (mg, False), (b.i.d, False), (., False), (\n", + ", True), (2, True), (., False), ( , False), (Simvastatin, False), (40, False), (mg, False), (nightly, False), (., False), (\n", + ", True), (3, True), (., False), ( , False), (Lisinopril, False), (5, False), (mg, False), (b.i.d, False), (., False), (\n", + ", True), (4, True), (., False), ( , False), (Protonix, False), (40, False), (mg, False), (a.m., False), (\n", + ", True), (5, True), (., False), ( , False), (Aspirin, False), (160, False), (mg, False), (a, False), (day, False), (., False), (\n", + ", True), (6, True), (., False), ( , False), (Lasix, False), (20, False), (mg, False), (b.i.d, False), (., False), (\n", + ", True), (7, True), (., False), ( , False), (Spiriva, False), (puff, False), (daily, False), (., False), (\n", + ", True), (8, True), (., False), ( , False), (Albuterol, False), (p.r.n, False), (., False), (q.i.d, False), (., False), (\n", + ", True), (9, True), (., False), ( , False), (Advair, False), (500/50, False), (puff, False), (b.i.d, False), (., False), (\n", + ", True), (10, True), (., False), ( , False), (Xopenex, False), (q.i.d, False), (., False), (and, False), 
(p.r.n, False), (., False), (\n", + ", True), (I, True), (will, False), (see, False), (her, False), (in, False), (a, False), (month, False), (to, False), (six, False), (weeks, False), (., False), ( , True), (She, True), (is, False), (to, False), (follow, False), (up, False), (with, False), (Dr., False), (X, False), (before, False), (that, False), (., False), (\n", + " \n", + "\n", + "\n", + " , True), (Ezoic, True), (-, True), (MTSam, False), (Sample, False), (Bottom, False), (Matched, False), (Content, False), (-, False), (native_bottom, False), (\n", + "\n", + "\n", + "\n", + "\n", + " , True), (End, True), (Ezoic, False), (-, True), (MTSam, False), (Sample, False), (Bottom, False), (Matched, False), (Content, False), (-, False), (native_bottom, False), (\n", + ", True)]\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PyRuSH.RuSH - DEBUG - Sentence(0-27):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(27-138):\t>Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(139-140):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(140-196):\t>She has chronic lung disease with bronchospastic angina.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(196-197):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(197-247):\t>We discovered new T-wave abnormalities on her EKG.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(248-249):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(249-306):\t>There was of course a four-vessel bypass surgery in 2001.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(307-308):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(308-336):\t>We did a coronary angiogram.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(337-338):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(338-450):\t>This demonstrated patent vein grafts and patent internal mammary vessel and so there was no obvious new disease.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(450-451):\t> <\n", + "PyRuSH.RuSH - DEBUG - 
Sentence(451-558):\t>She may continue in the future to have angina and she will have nitroglycerin available for that if needed.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(558-559):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(559-669):\t>Her blood pressure has been elevated and so instead of metoprolol, we have started her on Coreg 6.25 mg b.i.d.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(670-671):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(671-765):\t>This should be increased up to 25 mg b.i.d. as preferred antihypertensive in this lady's case.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(766-767):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(767-799):\t>She also is on an ACE inhibitor.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(799-800):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(800-837):\t>So her discharge meds are as follows:<\n", + "PyRuSH.RuSH - DEBUG - Sentence(837-838):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(838-862):\t>1. Coreg 6.25 mg b.i.d.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(862-863):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(863-893):\t>2. Simvastatin 40 mg nightly.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(893-894):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(894-920):\t>3. Lisinopril 5 mg b.i.d.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(920-921):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(921-944):\t>4. Protonix 40 mg a.m.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(944-945):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(945-970):\t>5. Aspirin 160 mg a day.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(970-971):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(971-993):\t>6. Lasix 20 mg b.i.d.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(993-994):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(994-1017):\t>7. Spiriva puff daily.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(1017-1018):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(1018-1045):\t>8. Albuterol p.r.n. q.i.d.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(1045-1046):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(1046-1075):\t>9. 
Advair 500/50 puff b.i.d.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(1075-1076):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(1076-1106):\t>10. Xopenex q.i.d. and p.r.n.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(1106-1107):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(1107-1146):\t>I will see her in a month to six weeks.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(1147-1148):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(1148-1191):\t>She is to follow up with Dr. X before that.<\n", + "PyRuSH.RuSH - DEBUG - Sentence(1191-1204):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(1204-1209):\t>Ezoic<\n", + "PyRuSH.RuSH - DEBUG - Sentence(1210-1263):\t>- MTSam Sample Bottom Matched Content - native_bottom<\n", + "PyRuSH.RuSH - DEBUG - Sentence(1264-1270):\t> <\n", + "PyRuSH.RuSH - DEBUG - Sentence(1270-1279):\t>End Ezoic<\n", + "PyRuSH.RuSH - DEBUG - Sentence(1280-1333):\t>- MTSam Sample Bottom Matched Content - native_bottom<\n", + "PyRuSH.RuSH - DEBUG - Sentence(1333-1334):\t> <\n" + ] + } + ], + "source": [ + "\n", + "nlp = English()\n", + "nlp.add_pipe(\"medspacy_pyrush\")\n", + "doc = nlp(input_str)\n", + "for sent in doc.sents:\n", + " start = sent.start_char\n", + " end = sent.end_char\n", + " print(f\"PyRuSH.RuSH - DEBUG - Sentence({start}-{end}):\\t>{str(sent)}<\".replace('\\n',' '))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e5f9fe60", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "53" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(list(doc.sents))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6f21337d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \n", + "\n", + "\n", + " <\n", + "----\n", + "\n", + ">Ms. 
ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency.<\n", + "----\n", + "\n", + "> <\n", + "----\n", + "\n", + ">She has chronic lung disease with bronchospastic angina.<\n", + "----\n", + "\n", + ">\n", + "<\n", + "----\n", + "\n", + ">We discovered new T-wave abnormalities on her EKG.<\n", + "----\n", + "\n", + "> <\n", + "----\n", + "\n", + ">There was of course a four-vessel bypass surgery in 2001.<\n", + "----\n", + "\n", + "> <\n", + "----\n", + "\n", + ">We did a coronary angiogram.<\n", + "----\n", + "\n", + ">\n", + "\n", + "<\n", + "----\n", + "\n" + ] + } + ], + "source": [ + "input_str = ''' \n", + "\n", + "\n", + " Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency. She has chronic lung disease with bronchospastic angina.\n", + "We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. 
\n", + "\n", + "'''\n", + "from PyRuSH.RuSH import initLogger\n", + "initLogger()\n", + "nlp = English()\n", + "nlp.add_pipe(\"medspacy_pyrush\")\n", + "doc = nlp(input_str)\n", + "sents = [s for s in doc.sents]\n", + "for sent in sents:\n", + " print('>' + str(sent) + '<\\n----\\n')" + ] + }, + { + "cell_type": "markdown", + "id": "5cf31051", + "metadata": {}, + "source": [ + "## Test dummy sentencizer" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a30b61a6", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from loguru import logger\n", + "logger.remove()\n", + "logger.add(sys.stderr, level=\"DEBUG\")\n", + "\n", + "import pytest\n", + "import spacy\n", + "from PyRuSH.StaticSentencizerFun import cpredict_merge_gaps\n", + "\n", + "def dummy_sentencizer(text):\n", + " # Dummy sentencizer: splits on periods and newlines\n", + " spans = []\n", + " start = 0\n", + " split=False\n", + " for i, c in enumerate(text):\n", + " if split:\n", + " spans.append(type('Span', (), {'begin': start, 'end': i+1})())\n", + " start = i+1\n", + " split=False\n", + " if c in '.\\n':\n", + " split=True \n", + " if start < len(text):\n", + " spans.append(type('Span', (), {'begin': start, 'end': len(text)})())\n", + " return spans" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "71159489", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m2025-09-02 12:51:47.767\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m6\u001b[0m - \u001b[34m\u001b[1mcpredict_merge_gaps called: docs=1, max_sentence_length=20\u001b[0m\n", + "\u001b[32m2025-09-02 12:51:47.768\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m6\u001b[0m - \u001b[34m\u001b[1m[doc 0] 1 spans detected: [(0, 89)]\u001b[0m\n", + "\u001b[32m2025-09-02 12:51:47.770\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m 
| \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m6\u001b[0m - \u001b[34m\u001b[1m[doc 0] Mark sentence start at token 0: 'A' idx=0 (span begin)\u001b[0m\n", + "\u001b[32m2025-09-02 12:51:47.772\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m6\u001b[0m - \u001b[34m\u001b[1m[doc 0] Mark/Split due to max_sentence_length at token 4: 'that' idx=21 (split before exceeding limit)\u001b[0m\n", + "\u001b[32m2025-09-02 12:51:47.773\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m6\u001b[0m - \u001b[34m\u001b[1m[doc 0] Mark/Split due to max_sentence_length at token 9: 'whitespace' idx=45 (split before exceeding limit)\u001b[0m\n", + "\u001b[32m2025-09-02 12:51:47.774\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m6\u001b[0m - \u001b[34m\u001b[1m[doc 0] Mark/Split due to max_sentence_length at token 12: 'max' idx=67 (split before exceeding limit)\u001b[0m\n", + "\u001b[32m2025-09-02 12:51:47.775\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m6\u001b[0m - \u001b[34m\u001b[1m[doc 0] Sentence start guesses: [0, 4, 9, 12]\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dummy_sentencizer spans: [(0, 89, 'A very long sentence that should be split at whitespace before the max length is reached.')]\n" + ] + } + ], + "source": [ + "nlp = spacy.blank('en')\n", + "doc = nlp(\"A very long sentence that should be split at whitespace before the max length is reached.\")\n", + "max_len = 20\n", + "spans = dummy_sentencizer(doc.text)\n", + "print(\"dummy_sentencizer spans:\", [(span.begin, span.end, doc.text[span.begin:span.end]) for span in spans])\n", + "guesses = cpredict_merge_gaps([doc], dummy_sentencizer, max_sentence_length=max_len)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": 
"2f54d05a", + "metadata": {}, + "outputs": [], + "source": [ + "sys.path.append('../PyRuSH')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ea7241fd", + "metadata": {}, + "outputs": [], + "source": [ + "from StaticSentencizerFun import cpredict_merge_gaps,cpredict_split_gaps, cset_annotations\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "790f0b46", + "metadata": {}, + "outputs": [], + "source": [ + "cset_annotations(doc, guesses)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ab9d18a4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(list(doc.sents))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "8c5bdcf7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">A very long sentence<\n", + ">that should be split at<\n", + ">whitespace before the<\n", + ">max length is reached.<\n" + ] + } + ], + "source": [ + "for s in doc.sents:\n", + " print(f'>{s.text}<')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c4b8f95", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "test", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index 92010d7..8520781 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,2 +1,28 @@ [build-system] -requires = ["setuptools", "wheel", 'Cython>=0.25,<3.0', 'numpy>=1.10', "spacy>=3.0.0", "cymem", 
"preshed","quicksectx>=0.3.5","PyFastNER>=1.0.8"] \ No newline at end of file +requires = ["setuptools", 'Cython',"wheel"] +build-backend = "setuptools.build_meta" + + +[project] +name = "PyRuSH" +dynamic = ["dependencies","readme","version"] +keywords = ['PyRuSH', 'NLP', 'sentenczier','sentence segmentation'] +authors = [{name = "Jianlin", email="jianlinshi.cn@gmail.com"}] +description = '''PyRuSH is the python implementation of RuSH (Rule-based sentence Segmenter using Hashing), which is originally developed using Java. RuSH is an efficient, reliable, and easy adaptable rule-based sentence segmentation solution. It is specifically designed to handle the telegraphic written text in clinical note. It leverages a nested hash table to execute simultaneous rule processing, which reduces the impact of the rule-base growth on execution time and eliminates the effect of rule order on accuracy. +If you wish to cite RuSH in a publication, please use: + +Jianlin Shi ; Danielle Mowery ; Kristina M. Doing-Harris ; John F. Hurdle.RuSH: a Rule-based Segmentation Tool Using Hashing for Extremely Accurate Sentence Segmentation of Clinical Text. AMIA Annu Symp Proc. 2016: 1587. 
+''' +requires-python = ">=3.6" +classifiers = [ + "Programming Language :: Python :: 3", +] +license = "MIT" + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} +readme={file = ['README.rst']} +version = {attr = "PyRuSH.__version__" } + +[project.urls] +Source = "https://github.com/jianlins/PyRuSH" \ No newline at end of file diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..2587caf --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +filterwarnings = + ignore:Importing 'parser.split_arg_string' is deprecated.*:DeprecationWarning \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 882dc85..0432f76 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ -Cython>=0.25,<3.0 +Cython setuptools -numpy -spacy>=3.0.0 +spacy<3.8; python_version < "3.12" +spacy>=3.8; python_version >= "3.12" PyFastNER>=1.0.8 -quicksectx>=0.3.5 \ No newline at end of file +quicksectx>=0.3.5 +loguru \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index f103568..ffcad36 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,7 @@ [metadata] -description_file = README.md + +readme = README.md +license = MIT [bdist_wheel] -python-tag=py3 \ No newline at end of file +python_tag=py3 diff --git a/setup.py b/setup.py index 7d5d533..e3ac1bc 100644 --- a/setup.py +++ b/setup.py @@ -2,10 +2,11 @@ from setuptools.extension import Extension from codecs import open from os import path +import os from Cython.Build import cythonize -import numpy -import spacy, cymem, preshed +# import numpy +# import spacy, cymem, preshed from distutils.sysconfig import get_python_inc here = path.abspath(path.dirname(__file__)) @@ -13,25 +14,21 @@ long_description = f.read() -def parse_requirements(filename): - """ load requirements from a pip requirements file """ - lineiter = (line.strip() for line in open(filename)) - return [line.split("#")[0].strip() for line in lineiter if line and not line.startswith("#")] +# def 
parse_requirements(filename): +# """ load requirements from a pip requirements file """ +# lineiter = (line.strip() for line in open(filename)) +# return [line.split("#")[0].strip() for line in lineiter if line and not line.startswith("#")] -print(parse_requirements('requirements.txt')) +# print(parse_requirements('requirements.txt')) def get_version(): - """Load the version from version.py, without importing it. + for line in open(os.path.join(os.path.dirname(__file__), 'PyRuSH', '__init__.py')).read().splitlines(): + if line.startswith('__version__'): + delim = '"' if '"' in line else "'" + return line.split(delim)[1] + else: + raise RuntimeError("Unable to find version string.") - This function assumes that the last line in the file contains a variable defining the - version string with single quotes. - - """ - try: - with open('PyRuSH/version.py', 'r') as f: - return f.read().split('\n')[0].split('=')[-1].replace('\'', '').strip() - except IOError: - return "0.0.0a1" COMPILER_DIRECTIVES = { "language_level": 3, @@ -41,10 +38,11 @@ def get_version(): dir_path = path.dirname(path.realpath(__file__)) include_dirs = [dir_path + "/PyRuSH", dir_path, - numpy.get_include(), - path.dirname(spacy.__file__), - path.dirname(cymem.__file__), - path.dirname(preshed.__file__)] + # numpy.get_include(), + # path.dirname(spacy.__file__), + # path.dirname(cymem.__file__), + # path.dirname(preshed.__file__) + ] extensions = [ Extension( 'PyRuSH.StaticSentencizerFun', @@ -75,16 +73,15 @@ def get_version(): 'Programming Language :: Python :: 3.11', 'Development Status :: 3 - Alpha', "Intended Audience :: Developers", - "License :: OSI Approved :: Apache Software License", + "License :: OSI Approved :: MIT Software License", "Operating System :: OS Independent", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Text Processing :: Linguistic", ], - license='Apache License', + license='MIT', zip_safe=False, include_package_data=True, - 
install_requires=parse_requirements('requirements.txt'), + # install_requires=parse_requirements('requirements.txt'), ext_modules=cythonize(extensions, compiler_directives=COMPILER_DIRECTIVES), - tests_require='pytest', package_data={'': ['*.pyx', '*.pxd', '*.so', '*.dll', '*.lib', '*.cpp', '*.c','../conf/rush_rules.tsv','../requirements.txt']}, ) diff --git a/tests/__pycache__/test_PyRushSentencizer.cpython-312-pytest-8.3.3.pyc b/tests/__pycache__/test_PyRushSentencizer.cpython-312-pytest-8.3.3.pyc new file mode 100644 index 0000000..25adbcc Binary files /dev/null and b/tests/__pycache__/test_PyRushSentencizer.cpython-312-pytest-8.3.3.pyc differ diff --git a/tests/__pycache__/test_Rush.cpython-312-pytest-8.3.3.pyc b/tests/__pycache__/test_Rush.cpython-312-pytest-8.3.3.pyc new file mode 100644 index 0000000..60adde8 Binary files /dev/null and b/tests/__pycache__/test_Rush.cpython-312-pytest-8.3.3.pyc differ diff --git a/tests/__pycache__/test_Rush_w_Logger.cpython-312-pytest-8.3.3.pyc b/tests/__pycache__/test_Rush_w_Logger.cpython-312-pytest-8.3.3.pyc new file mode 100644 index 0000000..587c3bc Binary files /dev/null and b/tests/__pycache__/test_Rush_w_Logger.cpython-312-pytest-8.3.3.pyc differ diff --git a/tests/test_PyRuSHSentencizer_param.py b/tests/test_PyRuSHSentencizer_param.py new file mode 100644 index 0000000..36415cb --- /dev/null +++ b/tests/test_PyRuSHSentencizer_param.py @@ -0,0 +1,70 @@ +import os +from loguru import logger +from spacy.lang.en import English +from PyRuSH.PyRuSHSentencizer import PyRuSHSentencizer + +text_short = "Sentence one. Sentence two!" +text_long = "This is a very long sentence that should be split at whitespace before the max length is reached. " * 5 +text_whitespace = "First sentence. Second sentence after spaces.\nThird sentence after newline." 
+rule_path = os.path.join(os.path.dirname(__file__), "rush_rules.tsv") + +def make_nlp(merge_gaps, max_sentence_length): + nlp = English() + nlp.add_pipe("medspacy_pyrush", config={ + "rules_path": rule_path, + "merge_gaps": merge_gaps, + "max_sentence_length": max_sentence_length + }) + return nlp + +def test_merge_gaps_true_no_maxlen(): + nlp = make_nlp(merge_gaps=True, max_sentence_length=None) + doc = nlp(text_short) + sents = [s.text for s in doc.sents] + logger.info("[merge_gaps=True, max_sentence_length=None] Split sentences:") + for i, sent in enumerate(sents): + logger.info(f" [{i}] len={len(sent)} {repr(sent)}") + assert len(sents) >= 2 + +def test_merge_gaps_false_no_maxlen(): + nlp = make_nlp(merge_gaps=False, max_sentence_length=None) + doc = nlp(text_short) + sents = [s.text for s in doc.sents] + logger.info("[merge_gaps=False, max_sentence_length=None] Split sentences:") + for i, sent in enumerate(sents): + logger.info(f" [{i}] len={len(sent)} {repr(sent)}") + assert len(sents) >= 2 + +def test_merge_gaps_true_with_maxlen(): + nlp = make_nlp(merge_gaps=True, max_sentence_length=50) + doc = nlp(text_long) + sents = [s.text for s in doc.sents] + logger.info("[merge_gaps=True, max_sentence_length=50] Split sentences:") + for i, sent in enumerate(sents): + logger.info(f" [{i}] len={len(sent)} {repr(sent)}") + # Should split long text into multiple sentences + assert len(sents) > 2 + for sent in sents: + assert len(sent) <= 60 # allow some leeway + +def test_merge_gaps_false_with_maxlen(): + nlp = make_nlp(merge_gaps=False, max_sentence_length=50) + doc = nlp(text_long) + sents = [s.text for s in doc.sents] + logger.info("[merge_gaps=False, max_sentence_length=50] Split sentences:") + for i, sent in enumerate(sents): + logger.info(f" [{i}] len={len(sent)} {repr(sent)}") + assert len(sents) > 2 + # Allow up to 100 chars due to tokenization edge cases + for sent in sents: + assert len(sent) <= 100 + +def test_whitespace_edge_merge(): + nlp = 
make_nlp(merge_gaps=True, max_sentence_length=20) + doc = nlp(text_whitespace) + sents = [s.text for s in doc.sents] + for i, sent in enumerate(sents): + logger.info(f" [{i}] len={len(sent)} {repr(sent)}") + assert len(sent) <= 20, f"Sentence {i} exceeds max_sentence_length: {len(sent)} > 20" + assert len(sents) >= 3 + diff --git a/tests/test_PyRushSentencizer.py b/tests/test_PyRushSentencizer.py index 0bb12fa..26df271 100644 --- a/tests/test_PyRushSentencizer.py +++ b/tests/test_PyRushSentencizer.py @@ -1,5 +1,7 @@ import unittest import os +import sys +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) from PyRuSH import PyRuSHSentencizer from spacy.lang.en import English @@ -8,7 +10,7 @@ class TestRuSH(unittest.TestCase): def setUp(self): - pwd = os.path.dirname(os.path.abspath(__file__)) + self.pwd = os.path.dirname(os.path.abspath(__file__)) def test_doc(self): nlp = English() @@ -49,16 +51,17 @@ def test_doc2(self): End Ezoic - MTSam Sample Bottom Matched Content - native_bottom ''' nlp = English() - nlp.add_pipe("medspacy_pyrush") + nlp.add_pipe("medspacy_pyrush", config={"rules_path": os.path.join(self.pwd, 'rush_rules.tsv')}) doc = nlp(input_str) sents = [s for s in doc.sents] for sent in sents: print('>' + str(sent) + '<\n\n') - assert (len(sents) == 26) - # SpaCy has no control of sentence end. Thus, it ends up with sloppy ends. - assert (sents[1].text=='Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with' - ' chest pain and respiratory insufficiency. ') + # New expected count includes whitespace-only sentences + assert (len(sents) == 51) + # For content checks, filter out whitespace-only sentences + content_sents = [s for s in sents if s.text.strip()] + assert (content_sents[0].text == 'Ms. 
ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency.') def test_doc3(self): input_str = ''' @@ -68,18 +71,18 @@ def test_doc3(self): We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. ''' - from PyRuSH.RuSH import initLogger - initLogger() + from loguru import logger + logger.add(sys.stdout, level="DEBUG") nlp = English() - nlp.add_pipe("medspacy_pyrush") + nlp.add_pipe("medspacy_pyrush", config={"rules_path": os.path.join(self.pwd, 'rush_rules.tsv')}) doc = nlp(input_str) sents = [s for s in doc.sents] for sent in sents: - print('>' + str(sent) + '<\n\n') + logger.debug('>' + str(sent) + '<\n\n') # SpaCy has no control of sentence end. Thus, it ends up with sloppy ends. assert (sents[1].text == 'Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with' - ' chest pain and respiratory insufficiency. ') + ' chest pain and respiratory insufficiency.') def test_customized_rules(self): input_str = ''' @@ -89,18 +92,18 @@ def test_customized_rules(self): We discovered new T-wave abnormalities on her EKG. There was of course a four-vessel bypass surgery in 2001. We did a coronary angiogram. 
''' - from PyRuSH.RuSH import initLogger - initLogger() + from loguru import logger + logger.add(sys.stdout, level="DEBUG") from PyRuSH import RuSH pwd = os.path.dirname(os.path.abspath(__file__)) rush = RuSH(str(os.path.join(pwd, 'rush_rules.tsv')), enable_logger=True) sentences = rush.segToSentenceSpans(input_str) # for i in range(0, len(sentences)): # sentence = sentences[i] - # print('assert (sentences[' + str(i) + '].begin == ' + str(sentence.begin) + ' and sentences[' + str( - # i) + '].end == ' + str(sentence.end) + ')') + # logger.debug('assert (sentences[' + str(i) + '].begin == ' + str(sentence.begin) + ' and sentences[' + str( + # i) + '].end == ' + str(sentence.end + ')') # self.printDetails(sentences, input_str) - # print('\n\n'.join(['>{}<'.format(input_str[s.begin:s.end]) for s in sentences])) + # logger.debug('\n\n'.join(['>{}<'.format(input_str[s.begin:s.end]) for s in sentences])) nlp = English() @@ -109,8 +112,9 @@ def test_customized_rules(self): doc = nlp(input_str) sents = [s for s in doc.sents] for sent in sents: - print('>' + str(sent) + '<\n\n') + logger.debug('>' + str(sent) + '<\n\n') # SpaCy has no control of sentence end. Thus, it ends up with sloppy ends. assert (sents[1].text == 'Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with' - ' chest pain and respiratory insufficiency. 
') \ No newline at end of file + ' chest pain and respiratory insufficiency.') + \ No newline at end of file diff --git a/tests/test_PyRushSentencizer2.py b/tests/test_PyRushSentencizer2.py new file mode 100644 index 0000000..8f3c1b7 --- /dev/null +++ b/tests/test_PyRushSentencizer2.py @@ -0,0 +1,45 @@ +import unittest +import os +import sys +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from PyRuSH import PyRuSHSentencizer +from spacy.lang.en import English + + +class TestRuSH(unittest.TestCase): + + def setUp(self): + self.pwd = os.path.dirname(os.path.abspath(__file__)) + + # def test_doc(self): + # nlp = English() + # nlp.add_pipe("medspacy_pyrush") + # doc = nlp("This is a sentence. This is another sentence.") + # print('\n'.join([str(s) for s in doc.sents])) + # print('\nTotal sentences: {}'.format(len([s for s in doc.sents]))) + # print('\ndoc is an instance of {}'.format(type(doc))) + + def test_doc4(self): + input_str='''Ms. [**Known patient lastname 2004**] was admitted on [**2573-5-30**]. Ultrasound +at the time of admission demonstrated pancreatic duct dilitation and +edematous gallbladder. She was admitted to the ICU. +Discharge Medications: +1. Miconazole Nitrate 2 % Powder Sig: One (1) Appl Topical BID +(2 times a day) as needed. +2. Heparin Sodium (Porcine) 5,000 unit/mL Solution Sig: One (1) +Injection TID (3 times a day). +3. 
Acetaminophen 160 mg/5 mL Elixir Sig: One (1) PO Q4-6H +(every 4 to 6 hours) as needed.''' + nlp = English() + nlp.add_pipe("medspacy_pyrush", config={"rules_path": os.path.join(self.pwd, 'rush_rules.tsv')}) + nlp.initialize() + doc = nlp(input_str) + sents = [s for s in doc.sents] + for sent in sents: + print('>' + str(sent) + '<\n\n') + assert(sents[-1].text=='''Sig: One (1) PO Q4-6H +(every 4 to 6 hours) as needed.''') + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_Rush.py b/tests/test_Rush.py index bedbef6..18bd374 100644 --- a/tests/test_Rush.py +++ b/tests/test_Rush.py @@ -95,7 +95,7 @@ def test7(self): sentences = rush.segToSentenceSpans(input_str) self.printDetails(sentences, input_str) - def test_doc2(self): + def test8(self): input_str = ''' 9. Advair b.i.d. 10. Xopenex q.i.d. and p.r.n. @@ -109,12 +109,44 @@ def test_doc2(self): sent = sentences[1] assert (input_str[sent.begin:sent.end] == '10. Xopenex q.i.d. and p.r.n.') - - def test_doc11(self): + def test9(self): input_str=' This is a sentence. This is another sentence.' - sentences=self.rush.segToSentenceSpans(input_str) - for sent in sentences: - print('>' + input_str[sent.begin:sent.end] + '<\n') + self.rush = RuSH(str(os.path.join(self.pwd, 'rush_rules.tsv')), min_sent_chars=2, enable_logger=True) + sentences = self.rush.segToSentenceSpans(input_str) + self.printDetails(sentences, input_str) + + def test10(self): + input_str='''Ms. [**Known patient lastname 2004**] was admitted on [**2573-5-30**]. Ultrasound +at the time of admission demonstrated pancreatic duct dilitation and +edematous gallbladder. She was admitted to the ICU. +Discharge Medications: +1. Miconazole Nitrate 2 % Powder Sig: One (1) Appl Topical BID +(2 times a day) as needed. +2. Heparin Sodium (Porcine) 5,000 unit/mL Solution Sig: One (1) +Injection TID (3 times a day). +3. 
Acetaminophen 160 mg/5 mL Elixir Sig: One (1) PO Q4-6H +(every 4 to 6 hours) as needed.''' + self.rush = RuSH(str(os.path.join(self.pwd, 'rush_rules.tsv')), min_sent_chars=2, enable_logger=True) + sentences = self.rush.segToSentenceSpans(input_str) + self.printDetails(sentences, input_str) + assert (sentences[0].begin == 0 and sentences[0].end == 173) + assert (sentences[1].begin == 174 and sentences[1].end == 202) + assert (sentences[2].begin == 203 and sentences[2].end == 225) + assert (sentences[3].begin == 226 and sentences[3].end == 258) + assert (sentences[4].begin == 259 and sentences[4].end == 316) + assert (sentences[5].begin == 317 and sentences[5].end == 367) + assert (sentences[6].begin == 368 and sentences[6].end == 411) + assert (sentences[7].begin == 412 and sentences[7].end == 447) + assert (sentences[8].begin == 448 and sentences[8].end == 502) + + def test11(self): + input_str = '''Patient doesn't have heart disease or high blood pressure, but their dad did have +diabetes. 
Pt is a 63M w/ h/o metastatic carcinoid tumor, HTN and hyperlipidemia.''' + self.rush = RuSH(str(os.path.join(self.pwd, 'rush_rules.tsv')), min_sent_chars=2, enable_logger=True) + sentences = self.rush.segToSentenceSpans(input_str) + self.printDetails(sentences, input_str) + assert (sentences[0].begin == 0 and sentences[0].end == 91) + assert (sentences[1].begin == 92 and sentences[1].end == 162) if __name__ == '__main__': unittest.main() diff --git a/tests/test_cpredict_split_gaps.py b/tests/test_cpredict_split_gaps.py new file mode 100644 index 0000000..809ef79 --- /dev/null +++ b/tests/test_cpredict_split_gaps.py @@ -0,0 +1,198 @@ + +import pytest +from PyRuSH.StaticSentencizerFun import cpredict_split_gaps +import spacy +from loguru import logger +from PyFastNER import Span +nlp = spacy.blank("en") + + +def dummy_sentencizer_fun(text): + # For testing, split sentences at every period + spans = [] + start = 0 + for i, c in enumerate(text): + if c == ".": + spans.append(Span(start, i+1)) + start = i+1 + if start < len(text): + spans.append(Span(start, len(text))) + return spans + +def make_doc_from_text(text): + # Use spaCy's default tokenizer + return nlp(text) + +def test_split_gaps_single_token(): + doc = make_doc_from_text("Hello") + guesses = cpredict_split_gaps([doc], dummy_sentencizer_fun) + starts = [i for i, v in enumerate(guesses[0]) if v] + assert starts == [0] + # Verify split sentence length + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + sentence_len = end - start + assert sentence_len > 0 + +def test_split_gaps_single_period(): + doc = make_doc_from_text(".") + guesses = cpredict_split_gaps([doc], dummy_sentencizer_fun) + starts = [i for i, v in enumerate(guesses[0]) if v] + assert starts == [0] + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + sentence_len = end - start + assert sentence_len > 0 + +def 
test_split_gaps_consecutive_periods(): + doc = make_doc_from_text("Hello..World.") + guesses = cpredict_split_gaps([doc], dummy_sentencizer_fun) + starts = [i for i, v in enumerate(guesses[0]) if v] + # Should mark the first token and after each period + assert starts[0] == 0 + assert len(starts) >= 2 + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + sentence_len = end - start + assert sentence_len > 0 + +def test_split_gaps_long_sentence_no_period(): + doc = make_doc_from_text("A " * 100) + guesses = cpredict_split_gaps([doc], dummy_sentencizer_fun, 20) + starts = [i for i, v in enumerate(guesses[0]) if v] + # Should split every ~10 tokens (since each token is 1 char + 1 space) + assert starts[0] == 0 + assert len(starts) > 1 + # Check each split sentence is <= 20 tokens + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + sentence_len = end - start + assert sentence_len <= 20, f"Sentence from {start} to {end} has length {sentence_len} > 20" + +def test_split_gaps_non_ascii(): + doc = make_doc_from_text("Hello 世界 . World .") + guesses = cpredict_split_gaps([doc], dummy_sentencizer_fun) + starts = [i for i, v in enumerate(guesses[0]) if v] + # Get sentences by splitting at sentence start indices + sentences = [] + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + sentences.append(" ".join([doc[i].text for i in range(start, end)])) + logger.debug(f"[test_split_gaps_non_ascii] Split sentences: {sentences}") + # Expect sentences to be 'Hello 世界 .' and 'World .' + assert any("世界" in s for s in sentences) + assert any("World" in s for s in sentences) + # Verify split sentence length + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + sentence_len = end - start + assert sentence_len > 0 + +def test_split_gaps_punctuation_only(): + doc = make_doc_from_text("!!! . ??? 
.") + guesses = cpredict_split_gaps([doc], dummy_sentencizer_fun) + starts = [i for i, v in enumerate(guesses[0]) if v] + assert starts[0] == 0 + assert len(starts) > 1 + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + sentence_len = end - start + assert sentence_len > 0 + +def test_split_gaps_basic(): + doc = make_doc_from_text("This is a sentence. This is another one.") + guesses = cpredict_split_gaps([doc], dummy_sentencizer_fun) + starts = [i for i, v in enumerate(guesses[0]) if v] + sentences = [] + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + sentences.append(" ".join([doc[i].text for i in range(start, end)])) + logger.debug(f"[test_split_gaps_basic] Split sentences: {sentences}") + assert "This is a sentence ." in sentences + assert "This is another one ." in sentences + # Verify split sentence length + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + sentence_len = end - start + assert sentence_len > 0 + +def test_split_gaps_max_length_none(): + doc = make_doc_from_text("A B C D E F G H I J K L M N O P Q R S T U V W X Y Z.") + guesses = cpredict_split_gaps([doc], dummy_sentencizer_fun, None) + starts = [i for i, v in enumerate(guesses[0]) if v] + assert starts == [0] + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + sentence_len = end - start + assert sentence_len > 0 + +def test_split_gaps_max_length_set(): + doc = make_doc_from_text("A B C D E F G H I J K L M N O P Q R S T U V W X Y Z.") + guesses = cpredict_split_gaps([doc], dummy_sentencizer_fun, 10) + starts = [i for i, v in enumerate(guesses[0]) if v] + assert starts[0] == 0 + assert len(starts) > 1 + # Check each split sentence is <= 10 characters + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + sentence_text = 
"".join([doc[i].text_with_ws for i in range(start, end)]) + char_len = len(sentence_text) + logger.debug(f"[test_split_gaps_max_length_set] Sentence from {start} to {end} has char length {char_len}") + assert char_len <= 10, f"Sentence from {start} to {end} has char length {char_len} > 10" + +def test_split_gaps_empty_doc(): + doc = make_doc_from_text("") + guesses = cpredict_split_gaps([doc], dummy_sentencizer_fun) + assert guesses == [[]] + +def test_split_gaps_whitespace_none(): + doc = make_doc_from_text(" . .") + guesses = cpredict_split_gaps([doc], dummy_sentencizer_fun, None) + starts = [i for i, v in enumerate(guesses[0]) if v] + sentences = [] + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + sentences.append(" ".join([doc[i].text for i in range(start, end)])) + sentences = [s.strip() for s in sentences] + logger.debug(f"[test_split_gaps_whitespace_none] Split sentences: {sentences}") + # Should have two sentences, each with a single period + assert sentences == [".", "."] + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + sentence_len = end - start + assert sentence_len > 0 + +def test_split_gaps_whitespace_set(): + doc = make_doc_from_text(" . .") + guesses = cpredict_split_gaps([doc], dummy_sentencizer_fun, 5) + starts = [i for i, v in enumerate(guesses[0]) if v] + sentences = [] + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + sentences.append(" ".join([doc[i].text for i in range(start, end)])) + sentences = [s.strip() for s in sentences] + logger.debug(f"[test_split_gaps_whitespace_set] Split sentences: {sentences}") + assert sentences == [".", "."] + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + sentence_len = end - start + assert sentence_len > 0 + +def test_split_gaps_mixed_whitespace_and_text(): + doc = make_doc_from_text(" . Hello . 
. World .") + guesses = cpredict_split_gaps([doc], dummy_sentencizer_fun) + starts = [i for i, v in enumerate(guesses[0]) if v] + sentences = [] + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + sentences.append(" ".join([doc[i].text for i in range(start, end)])) + sentences = [s.strip() for s in sentences] + logger.debug(f"[test_split_gaps_mixed_whitespace_and_text] Split sentences: {sentences}") + # Should have sentences: '.', 'Hello .', 'World .' + assert "." in sentences + assert "Hello ." in sentences + assert "World ." in sentences + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + sentence_len = end - start + assert sentence_len > 0 \ No newline at end of file diff --git a/tests/test_debug.py b/tests/test_debug.py new file mode 100644 index 0000000..e5130d6 --- /dev/null +++ b/tests/test_debug.py @@ -0,0 +1,35 @@ +import sys,os +from loguru import logger +logger.remove() +logger.add(sys.stdout, level="DEBUG") + +def test_whitespace_edge_split(): + from spacy.lang.en import English + from loguru import logger + import medspacy + text_whitespace = "First sentence. Second sentence before spaces.\nThird sentence after newline." 
+ nlp = English() + rule_path=os.path.join(os.path.dirname(__file__), 'rush_rules.tsv') + nlp.add_pipe("medspacy_pyrush", config={ + "rules_path": rule_path, + "merge_gaps": False, + "max_sentence_length": 20 + }) + sentencizer = nlp.get_pipe("medspacy_pyrush") + doc = nlp(text_whitespace) + # Try to get the actual span function from RuSH + spans=sentencizer.rush.segToSentenceSpans(text_whitespace) + logger.debug('Print rush segmented spans: \n----------------\n') + logger.debug(f"Spans: {[(span.begin, span.end) for span in spans]}\n----------------\n") + logger.debug(f'Print token offsets: ') + logger.debug(f'{[(t, t.idx) for t in doc]}') + doc_guesses = sentencizer.predict([doc])[0] + logger.debug(f"doc_guesses: {doc_guesses}") + serialized = [(str(d), l) for d, l in zip(list(doc), doc_guesses)] + logger.debug(f"Serialized: {serialized}") + # Adjusted expected output to match spacy tokenization + goal = [("First", True), ("sentence", False), (".", False), (" ", True), ("Second", True), ("sentence", False), ("before", True), ("spaces", False), (".", False), ("\n", True), ("Third", True), ("sentence", False), ("after", False), ("newline", True), (".", False)] + logger.debug(f"Goal: {goal}") + for s, g in zip(serialized, goal): + logger.debug(f'{s} == {g}' if s == g else f'{s} != {g}') + assert s == g diff --git a/tests/test_merge_gaps_max_length.py b/tests/test_merge_gaps_max_length.py new file mode 100644 index 0000000..902ddf1 --- /dev/null +++ b/tests/test_merge_gaps_max_length.py @@ -0,0 +1,119 @@ +import sys +from loguru import logger +logger.remove() +logger.add(sys.stderr, level="DEBUG") + +import pytest +import spacy +from PyRuSH.StaticSentencizerFun import cpredict_merge_gaps + +def dummy_sentencizer(text): + # Dummy sentencizer: splits on periods and newlines + spans = [] + start = 0 + split=False + for i, c in enumerate(text): + if split: + spans.append(type('Span', (), {'begin': start, 'end': i+1})()) + start = i+1 + split=False + if c in '.\n': + 
split=True + if start < len(text): + spans.append(type('Span', (), {'begin': start, 'end': len(text)})()) + return spans + + +def dummy_sentencizer2(text): + # Dummy sentencizer: splits on periods and newlines + spans = [] + start = 0 + for i, c in enumerate(text): + if c in '.\n': + spans.append(type('Span', (), {'begin': start, 'end': i+1})()) + start = i+1 + if start < len(text): + spans.append(type('Span', (), {'begin': start, 'end': len(text)})()) + return spans + +def test_merge_gaps_basic(): + nlp = spacy.blank('en') + doc = nlp("This is a sentence. This is another one.") + spans = dummy_sentencizer(doc.text) + print("dummy_sentencizer spans:", [(span.begin, span.end, doc.text[span.begin:span.end]) for span in spans]) + print("Tokens:") + for i, token in enumerate(doc): + print(f" idx={i}, text='{token.text}', token.idx={token.idx}") + guesses = cpredict_merge_gaps([doc], dummy_sentencizer) + print("cpredict_merge_gaps sentence starts:", [(i, token.text) for i, token in enumerate(doc) if guesses[0][i]]) + print("guesses:", guesses[0]) + assert guesses[0].count(True) == 2 + # Verify split sentence character length is non-zero + starts = [i for i, v in enumerate(guesses[0]) if v] + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + sentence_text = "".join([doc[i].text_with_ws for i in range(start, end)]) + char_len = len(sentence_text) + assert char_len > 0 + +def test_merge_gaps_basic2(): + nlp = spacy.blank('en') + doc = nlp("This is a sentence. 
This is another one.") + spans = dummy_sentencizer2(doc.text) + print("dummy_sentencizer spans:", [(span.begin, span.end, doc.text[span.begin:span.end]) for span in spans]) + print("Tokens:") + for i, token in enumerate(doc): + print(f" idx={i}, text='{token.text}', token.idx={token.idx}") + guesses = cpredict_merge_gaps([doc], dummy_sentencizer2) + print("cpredict_merge_gaps sentence starts:", [(i, token.text) for i, token in enumerate(doc) if guesses[0][i]]) + print("guesses:", guesses[0]) + assert guesses[0].count(True) == 2 + starts = [i for i, v in enumerate(guesses[0]) if v] + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + sentence_text = "".join([doc[i].text_with_ws for i in range(start, end)]) + char_len = len(sentence_text) + assert char_len > 0 + + +def test_merge_gaps_max_length(): + nlp = spacy.blank('en') + doc = nlp("A very long sentence that should be split at whitespace before the max length is reached.") + max_len = 20 + spans = dummy_sentencizer(doc.text) + print("dummy_sentencizer spans:", [(span.begin, span.end, doc.text[span.begin:span.end]) for span in spans]) + guesses = cpredict_merge_gaps([doc], dummy_sentencizer, max_sentence_length=max_len) + print("cpredict_merge_gaps sentence starts:", [(i, token.text) for i, token in enumerate(doc) if guesses[0][i]]) + # Should split at least once + assert guesses[0].count(True) > 1 + starts = [i for i, v in enumerate(guesses[0]) if v] + for idx, start in enumerate(starts): + last_token_idx = starts[idx + 1] - 1 if idx + 1 < len(starts) else len(doc) - 1 + end_offset = doc[last_token_idx].idx + len(doc[last_token_idx]) + sentence_text = doc.text[doc[start].idx:end_offset] + char_len = len(sentence_text) + logger.debug(f'{sentence_text} --- length: {char_len}') + assert char_len <= max_len, f"Sentence from {start} to {last_token_idx} has char length {char_len} > {max_len}" + +def test_merge_gaps_whitespace_edge(): + nlp = spacy.blank('en') + doc = 
nlp("First sentence. Second sentence after spaces.\nThird sentence after newline.") + spans = dummy_sentencizer(doc.text) + print("dummy_sentencizer spans:", [(span.begin, span.end, doc.text[span.begin:span.end]) for span in spans]) + guesses = cpredict_merge_gaps([doc], dummy_sentencizer, max_sentence_length=15) + print("cpredict_merge_gaps sentence starts:", [(i, token.text) for i, token in enumerate(doc) if guesses[0][i]]) + # Should split at whitespace/newline before max length + assert guesses[0].count(True) >= 3 + starts = [i for i, v in enumerate(guesses[0]) if v] + for idx, start in enumerate(starts): + end = starts[idx + 1] if idx + 1 < len(starts) else len(doc) + # Find last non-whitespace token in the chunk + last_token_idx = end - 1 + while last_token_idx >= start and doc[last_token_idx].text.isspace(): + last_token_idx -= 1 + if last_token_idx < start: + continue # skip empty chunk + end_offset = doc[last_token_idx].idx + len(doc[last_token_idx]) + sentence_text = doc.text[doc[start].idx:end_offset] + char_len = len(sentence_text) + assert char_len <= 15, f"Sentence from {start} to {last_token_idx} has char length {char_len} > 15"