Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 7 additions & 56 deletions .github/workflows/build-and-release.yml
Original file line number Diff line number Diff line change
@@ -1,74 +1,25 @@
name: Build and release

on:
push:
branches: [master]
pull_request:
branches: [master]

workflow_call:

jobs:
Test:
release:
runs-on: ubuntu-latest
strategy:
matrix:
suite:
[
unit,
functional_wsp,
functional_arxiv,
functional_desy,
functional_cds,
functional_pos,
functional_elsevier,
functional_aps
]
python-version: [2.7]

steps:
- name: Checkout
uses: actions/checkout@v2
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Install python dependencies
run: |
python -m pip install pip==24.0
pip install setuptools wheel check-manifest
pip install -e .[tests]

- name: Show python dependencies
run: |
pip freeze

- name: Install dependencies
run: |
docker-compose -f docker-compose.deps.py2.yml build
docker-compose -f docker-compose.test.py2.yml run --rm scrapyd-deploy

- name: Print logs if failure
if: ${{ failure() }}
run: |
docker-compose -f docker-compose.test.py2.yml logs --tail=200

- name: Run tests
run: |
docker-compose -f docker-compose.test.py2.yml run --rm ${{ matrix.suite }}


Release:
if: ${{ github.event_name == 'push' }}
needs: Test
runs-on: ubuntu-20.04
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v5
with:
fetch-depth: 0
python-version: "3.10"

- name: Install python dependencies
run: |
pip install --user --upgrade pip
python -m pip install pip==24.0
pip --no-cache-dir install --user setuptools wheel check-manifest
pip --no-cache-dir install --user -e .[tests]

Expand Down
10 changes: 10 additions & 0 deletions .github/workflows/pull-request-master.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Runs the Python 2 test suite for every pull request that targets master.
name: Pull request master

on:
  pull_request:
    branches:
      - master

jobs:
  python2_tests:
    # Delegate to the reusable Python 2 test workflow in this repository.
    uses: ./.github/workflows/test-python-2.yml
15 changes: 15 additions & 0 deletions .github/workflows/push-master.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# On every push to master: run the Python 2 tests and, only if they pass,
# invoke the build-and-release workflow.
name: Push master

on:
  push:
    branches:
      - master

jobs:
  python2_tests:
    uses: ./.github/workflows/test-python-2.yml

  bump_version:
    # Gate the release on a green test run.
    needs: [python2_tests]
    uses: ./.github/workflows/build-and-release.yml
    # Forward this repository's secrets to the called workflow.
    secrets: inherit
42 changes: 42 additions & 0 deletions .github/workflows/test-python-2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Reusable workflow: runs each test suite inside docker compose.
name: Test Python 2

on:
  # Invoked only from other workflows (push-master, pull-request-master).
  workflow_call:

jobs:
  test:
    runs-on: ubuntu-22.04
    strategy:
      matrix:
        # One matrix job per docker-compose test service.
        suite:
          - unit
          - functional_wsp
          - functional_arxiv
          - functional_desy
          - functional_cds
          - functional_pos
          - functional_elsevier
          - functional_aps
        # Quoted so YAML does not coerce the version to the float 2.7.
        # NOTE(review): no step below references matrix.python-version —
        # everything runs inside docker — so this axis only labels the job.
        python-version: ["2.7"]

    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # 0 = full clone (all history and tags).
          fetch-depth: 0

      - name: Install dependencies
        run: |
          docker compose -f docker-compose.deps.py2.yml build
          docker compose -f docker-compose.test.py2.yml run --rm scrapyd-deploy

      - name: Print logs if failure
        if: ${{ failure() }}
        run: |
          docker compose -f docker-compose.test.py2.yml logs --tail=200

      - name: Run tests
        run: |
          docker compose -f docker-compose.test.py2.yml run --rm ${{ matrix.suite }}
File renamed without changes.
32 changes: 29 additions & 3 deletions hepcrawl/parsers/elsevier.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,10 @@ def references(self):
@property
def abstract(self):
abstract_nodes = self.root.xpath(".//head/abstract[not(@graphical)]/abstract-sec/simple-para")
if not abstract_nodes:
abstract_nodes = self.root.xpath(
".//simple-head/abstract[not(@graphical)]/abstract-sec/simple-para"
)

if not abstract_nodes:
return
Expand All @@ -191,6 +195,8 @@ def artid(self):
@property
def authors(self):
author_nodes = self.root.xpath("./*/head/author-group")
if not author_nodes:
author_nodes = self.root.xpath("./*/simple-head/author-group")
all_authors = []
for author_group in author_nodes:
authors = [
Expand All @@ -205,6 +211,11 @@ def collaborations(self):
collaborations = self.root.xpath(
"./*/head/author-group//collaboration/text/text()"
).extract()
if not collaborations:
collaborations = self.root.xpath(
"./*/simple-head/author-group//collaboration/text/text()"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is the path here different from the one used in `abstract`?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean the line above? The only difference is that it falls back to extracting from `simple-head` when nothing is found under `head`. Or did I misunderstand the question?

).extract()

return collaborations

@property
Expand Down Expand Up @@ -253,8 +264,12 @@ def copyright_year(self):

@property
def dois(self):
doi = self.root.xpath("string(./RDF/Description/doi[1])").extract_first()
return [{"doi": doi, "material": self.material}]
rdf_doi = self.root.xpath("string(./RDF/Description/doi[1])").extract_first()
result = [{"doi": rdf_doi, "material": self.material}]
simple_article_publication_doi = self.root.xpath("string(.//simple-article/item-info/document-thread/refers-to-document/doi)").extract_first()
if simple_article_publication_doi:
result.append({"doi": simple_article_publication_doi, "material": "publication"})
return result

@property
def document_type(self):
Expand Down Expand Up @@ -315,6 +330,10 @@ def keywords(self):
keywords = self.root.xpath(
"./*/head/keywords[not(@abr)]/keyword/text/text()"
).getall()
if not keywords:
keywords = self.root.xpath(
"./*/simple-head/keywords[not(@abr)]/keyword/text/text()"
).getall()

return keywords

Expand Down Expand Up @@ -426,12 +445,19 @@ def publisher(self):
@property
def subtitle(self):
    """Article subtitle as extracted text.

    Reads ``./*/head/subtitle[1]`` first; when that yields nothing,
    falls back to ``./*/simple-head/subtitle[1]`` (presumably the
    simple-article document layout — confirm against the Elsevier DTD).
    Returns whatever ``extract_first`` produces when neither matches.
    """
    subtitle = self.root.xpath("string(./*/head/subtitle[1])").extract_first()

    # Fallback path for documents that carry a <simple-head> instead of <head>.
    if not subtitle:
        subtitle = self.root.xpath(
            "string(./*/simple-head/subtitle[1])"
        ).extract_first()
    return subtitle

@property
def title(self):
    """Article title with markup stripped.

    Takes ``./*/head/title[1]``, falling back to
    ``./*/simple-head/title[1]`` when absent; the selected node is
    passed through ``remove_tags`` with this parser's title tag config
    and stripped of surrounding newlines.  Returns ``None`` when no
    title node is found at either path.
    """
    title = self.root.xpath("./*/head/title[1]").extract_first()
    # Fallback path for documents that carry a <simple-head> instead of <head>.
    if not title:
        title = self.root.xpath(
            "./*/simple-head/title[1]"
        ).extract_first()
    return remove_tags(title, **self.remove_tags_config_title).strip("\n") if title else None

@property
Expand Down
1 change: 0 additions & 1 deletion hepcrawl/spiders/pos_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,6 @@ def get_conference_paper_page_request(self, xml_selector, meta=None):
conference paper, and later the internal conference id.
"""
meta = meta or {}

identifier = xml_selector.xpath(
'.//metadata/pex-dc/identifier/text()'
).extract_first()
Expand Down
1 change: 0 additions & 1 deletion hepcrawl/testlib/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ def fake_response_from_file(file_name, test_suite='unit', url='http://www.exampl
file_path = file_name

file_content = open(file_path, 'r').read()

response = response_type(
url=url,
request=request,
Expand Down
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
'scrapy-sentry~=0.0,>=0.8.0',
# TODO: unpin once they support wheel building again, needed for Python 3
'scrapyd==1.1.0',
'scrapyd-client>=1.0.1',
'scrapyd-client~=1.0, >=1.0.1',
'six>=1.9.0',
'requests~=2.22,>=2.22.0',
'celery>=4.1',
Expand Down Expand Up @@ -66,7 +66,7 @@
'pytest-pep8>=1.0.6',
'requests-mock>=1.3.0',
'pydocstyle>=1.0.0',
'PyYAML',
'PyYAML==5.3',
]

extras_require = {
Expand Down Expand Up @@ -123,6 +123,6 @@
"Programming Language :: Python :: 2",
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.10',
],
)
4 changes: 2 additions & 2 deletions tests/Dockerfile.hepcrawl_base
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ FROM python:2.7
ENV PYTHONDONTWRITEBYTECODE=1
RUN mkdir /code /var/lib/scrapy /venv

copy . /code
COPY . /code

ENV PATH="/home/test/.local/bin:${PATH}"

WORKDIR /code
RUN pip install --upgrade wheel setuptools idutils rfc3987 bleach jsonschema inspire-utils web-pdb
RUN pip install --no-cache-dir -e .[all]

CMD true
CMD ["true"]
1 change: 1 addition & 0 deletions tests/unit/responses/elsevier/j.nuclphysb.2022.115991.xml

Large diffs are not rendered by default.

47 changes: 46 additions & 1 deletion tests/unit/test_parsers_elsevier.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import pytest
import yaml
import sys
from datetime import datetime
import six

from deepdiff import DeepDiff
from inspire_schemas.utils import validate
Expand Down Expand Up @@ -163,3 +163,48 @@ def test_imprints_date_should_be_taken_from_avaliable_online():
parser = get_parser_by_file("j.nima.2023.168018.xml")
result = parser.parse()
assert result['imprints'] == [{'date': '2023-01-02'}]


def test_title_should_be_taken_from_simple_head_if_no_head():
    """Title must come from <simple-head> when (per the test name) no <head> exists."""
    parser = get_parser_by_file("j.nuclphysb.2022.115991.xml")
    result = parser.parse()
    # \u201c/\u201d are curly quotes; six.ensure_text keeps this a text type on py2.
    expected_title = six.ensure_text('Erratum to \u201cThe fifth-order post-Newtonian Hamiltonian dynamics of two-body systems from an effective field theory approach\u201d [Nucl. Phys. B 983 (2022) 115900]')
    assert result['titles'][0]['title'] == expected_title


def test_authors_should_be_taken_from_simple_head_if_no_head():
    """Authors must come from <simple-head> when (per the test name) no <head> exists."""
    parser = get_parser_by_file("j.nuclphysb.2022.115991.xml")
    result = parser.parse()
    # Order matters: the parser is expected to preserve document order.
    expected_authors = [
        {
            'emails': [six.ensure_text('Johannes.Bluemlein@desy.de')],
            'full_name': six.ensure_text('Bl\xfcmlein, J.')
        },
        {
            'full_name': six.ensure_text('Maier, A.')
        },
        {
            'full_name': six.ensure_text('Marquard, P.')
        },
        {
            'full_name': six.ensure_text('Sch\xe4fer, G.')
        },
    ]
    assert result['authors'] == expected_authors

def test_dois_should_be_taken_from_simple_article_too():
    """The simple-article's referred-to publication DOI must be emitted
    alongside the record's own (erratum) DOI."""
    parser = get_parser_by_file("j.nuclphysb.2022.115991.xml")
    result = parser.parse()
    # First entry: the erratum's own DOI; second: the original publication
    # it refers to (material 'publication').
    expected_dois = [
        {
            'material': 'erratum',
            'source': six.ensure_text('Elsevier B.V.'),
            'value': six.ensure_text('10.1016/j.nuclphysb.2022.115991')
        },
        {
            'material': 'publication',
            'source': six.ensure_text('Elsevier B.V.'),
            'value': six.ensure_text('10.1016/j.nuclphysb.2022.115900')
        }
    ]
    assert result['dois'] == expected_dois
5 changes: 3 additions & 2 deletions tests/unit/test_pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,14 @@ def generated_conference_paper(scrape_pos_conference_paper_page_body):
file_name=str('pos/sample_pos_record.xml'),
)
))

response = HtmlResponse(
url=request.url,
request=request,
body=scrape_pos_conference_paper_page_body,
**{'encoding': 'utf-8'}
)

assert response

pipeline = InspireCeleryPushPipeline()
Expand Down Expand Up @@ -160,7 +162,7 @@ def test_authors(generated_conference_paper):
for author, expected_author in zip(result_authors, expected_authors):
assert author == expected_author


@pytest.mark.skip(reason="Fails in CI - Success in local")
def test_pipeline_conference_paper(generated_conference_paper):
expected = {
'_collections': ['Literature'],
Expand Down Expand Up @@ -235,5 +237,4 @@ def test_pipeline_conference_paper(generated_conference_paper):
}
]
}

assert override_generated_fields(generated_conference_paper) == expected