diff --git a/allofplos/article.py b/allofplos/article.py
index bad63c3b..264563ca 100644
--- a/allofplos/article.py
+++ b/allofplos/article.py
@@ -9,8 +9,9 @@
 
 from . import get_corpus_dir
 from .transformations import (filename_to_doi, _get_base_page, LANDING_PAGE_SUFFIX,
-                              URL_SUFFIX, plos_page_dict, doi_url, doi_to_url, doi_to_path)
-from .plos_regex import validate_doi
+                              URL_SUFFIX, plos_page_dict, doi_url, doi_to_url, doi_to_path,
+                              partial_to_doi)
+from .plos_regex import validate_doi, find_valid_partial_dois
 from .elements import (parse_article_date, get_contrib_info,
                        Journal, License, match_contribs_to_dicts)
 from .utils import dedent
@@ -124,6 +125,15 @@ def doi(self):
         """
         return self._doi
 
+    @property
+    def partial_doi(self):
+        """Return the DOI without its '10.1371/' prefix and 'journal.' segment."""
+        # Slice the prefix off; str.lstrip() strips a character set, not a prefix.
+        doi = self.doi
+        if doi.startswith('10.1371/'):
+            doi = doi[len('10.1371/'):]
+        return doi.replace('journal.', '')
+
     @property
     def text_viewer(self):
         """Command line application for viewing text to be used with
@@ -1336,3 +1346,19 @@ def from_filename(cls, filename):
         else:
             directory = None
         return cls(filename_to_doi(filename), directory=directory)
+
+    @classmethod
+    def from_partial_doi(cls, partial_doi, directory=None):
+        """Initiate an article object using a partial DOI.
+
+        The partial DOI (e.g. 'pone.0040259') is expanded to a full DOI with
+        `partial_to_doi`, which uses regex to make sure it's a valid partial
+        DOI and raises if it is not. Used for internal PLOS methods.
+
+        :param partial_doi: PLOS partial DOI string
+        :param directory: article directory; defaults to `get_corpus_dir()`
+        :return: a new instance of `cls` for the expanded DOI
+        """
+        if directory is None:
+            directory = get_corpus_dir()
+        return cls(partial_to_doi(partial_doi), directory=directory)
diff --git a/allofplos/corpus/corpus.py b/allofplos/corpus/corpus.py
index fc176ef3..dbd3ec90 100644
--- a/allofplos/corpus/corpus.py
+++ b/allofplos/corpus/corpus.py
@@ -5,7 +5,8 @@
 from itertools import islice
 
 from .. import get_corpus_dir, Article
-from ..transformations import filename_to_doi, doi_to_path
+from ..transformations import filename_to_doi, doi_to_path, partial_to_doi
+from ..plos_regex import validate_doi, validate_partial_doi
 
 
 class Corpus:
@@ -36,13 +37,17 @@ def __iter__(self):
         return (article for article in self.random_article_generator)
 
     def __getitem__(self, key):
-
         if isinstance(key, int):
             return Article(self.dois[key], directory=self.directory)
         elif isinstance(key, slice):
             return (Article(doi, directory=self.directory)
                     for doi in self.dois[key])
         elif key not in self.dois:
+            # Validate before expanding: partial_to_doi raises on anything else.
+            if validate_partial_doi(key):
+                key = partial_to_doi(key)
+                if key in self.dois:
+                    return Article(key, directory=self.directory)
             path= doi_to_path(key, directory=self.directory)
             raise IndexError(("You attempted get {doi} from "
                               "the corpus at \n{directory}. \n"
diff --git a/allofplos/plos_regex.py b/allofplos/plos_regex.py
index 0939d613..9a586764 100644
--- a/allofplos/plos_regex.py
+++ b/allofplos/plos_regex.py
@@ -22,6 +22,10 @@
 full_doi_regex_match = re.compile(regex_match_prefix+regex_body_match)
 full_doi_regex_search = re.compile(r"10\.1371/journal\.p[a-zA-Z]{3}\.[\d]{7}"
                                    "|10\.1371/annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}")
+partial_doi_regex_search = re.compile(r"p[a-zA-Z]{3}\.[\d]{7}"
+                                      "|annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}")
+partial_doi_regex_match = re.compile(r"^p[a-zA-Z]{3}\.[\d]{7}$"
+                                     r"|^annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}$")
 currents_doi_regex = re.compile(regex_match_prefix+regex_body_currents)
 file_regex_match = re.compile(regex_file_search+r"\.xml")
 BASE_URL = 'http://journals.plos.org/plosone/article/file?id='
@@ -41,6 +45,14 @@ def validate_doi(doi):
     return bool(full_doi_regex_match.search(doi))
 
 
+def validate_partial_doi(partial_doi):
+    """For an individual string, tests whether the full string is in a valid PLOS partial DOI format.
+    Example: 'pbio.2000777' is True, but '10.1371/journal.pbio.2000777' is False
+    :return: True if string is in valid PLOS partial DOI format; False if not
+    """
+    return bool(partial_doi_regex_match.search(partial_doi))
+
+
 def validate_filename(filename):
     """
     For an individual string, tests whether the full string is in a valid article file. This can take two forms.
@@ -75,6 +87,15 @@ def find_valid_dois(doi):
     return full_doi_regex_search.findall(doi)
 
 
+def find_valid_partial_dois(doi):
+    """
+    For an individual string, searches for any valid partial PLOS DOIs within it and returns them
+    Used for finding DOIs in PLOS job tickets
+    :return: list of valid PLOS partial DOIs contained within string
+    """
+    return partial_doi_regex_search.findall(doi)
+
+
 def show_invalid_dois(doi_list):
     """
     Checks to see whether a list of PLOS DOIs follow the correct format. Used mainly to determine
diff --git a/allofplos/tests/test_partial_dois.py b/allofplos/tests/test_partial_dois.py
new file mode 100644
index 00000000..b750fd2c
--- /dev/null
+++ b/allofplos/tests/test_partial_dois.py
@@ -0,0 +1,57 @@
+from .. import Corpus, Article, starterdir
+from ..plos_regex import validate_partial_doi, validate_doi
+from ..transformations import partial_to_doi, doi_to_partial
+
+import pytest
+
+
+@pytest.fixture
+def corpus():
+    return Corpus(starterdir, seed=1000)
+
+
+@pytest.fixture
+def test_article():
+    return Article('10.1371/journal.pone.0040259', directory=starterdir)
+
+
+@pytest.fixture
+def test_doi():
+    return '10.1371/journal.pone.0040259'
+
+
+@pytest.fixture
+def test_partial_doi():
+    return 'pone.0040259'
+
+
+def test_partial_doi_regex(test_partial_doi):
+    assert validate_partial_doi(test_partial_doi)
+    assert not validate_partial_doi(' pone.0040259')
+    assert not validate_partial_doi('pone.0040259 ')
+
+
+def test_partial_doi_transform(test_doi, test_partial_doi):
+    partial_doi = doi_to_partial(test_doi)
+    assert partial_doi == test_partial_doi
+
+
+def test_doi_transform(test_partial_doi, test_doi):
+    doi = partial_to_doi(test_partial_doi)
+    assert validate_doi(doi)
+    assert doi == test_doi
+
+
+def test_partial_doi_method_article(test_partial_doi, test_article):
+    article = Article.from_partial_doi(test_partial_doi, directory=starterdir)
+    assert article == test_article
+
+
+def test_partial_doi_method_corpus(corpus, test_article, test_partial_doi):
+    article = corpus[test_partial_doi]
+    assert article == test_article
+
+
+def test_invalid_partial_doi_raises(test_doi):
+    with pytest.raises(ValueError):
+        partial_to_doi(test_doi)
diff --git a/allofplos/transformations.py b/allofplos/transformations.py
index 38de9608..c116e765 100644
--- a/allofplos/transformations.py
+++ b/allofplos/transformations.py
@@ -5,7 +5,7 @@
 
 
 from . import get_corpus_dir
-from .plos_regex import validate_filename, validate_doi
+from .plos_regex import validate_filename, validate_doi, validate_partial_doi
 from .elements import Journal
 
 # URL bases for PLOS's Solr instances, that index PLOS articles
@@ -183,6 +183,40 @@ def doi_to_path(doi, directory=None):
     article_file = os.path.join(directory, doi.lstrip(PREFIX) + SUFFIX_LOWER)
     return article_file
 
+
+def partial_to_doi(partial_doi):
+    """Convert a partial DOI into a DOI.
+
+    Example: 'pone.0040259' -> '10.1371/journal.pone.0040259'
+
+    :param partial_doi: PLOS partial DOI, e.g. 'pone.0040259' or 'annotation/...'
+    :return: full PLOS DOI
+    :raises ValueError: if `partial_doi` is not a valid PLOS partial DOI
+    """
+    if not validate_partial_doi(partial_doi):
+        raise ValueError("Invalid format for PLOS partial DOI: {}".format(partial_doi))
+    if partial_doi.startswith('annotation'):
+        return PREFIX + partial_doi
+    return ''.join([PREFIX, 'journal.', partial_doi])
+
+
+def doi_to_partial(doi):
+    """Convert a DOI into a partial DOI.
+
+    Example: '10.1371/journal.pone.0040259' -> 'pone.0040259'
+
+    :param doi: full PLOS DOI
+    :return: the DOI without the '10.1371/' prefix and the 'journal.' segment
+    :raises ValueError: if `doi` is not a valid PLOS DOI
+    """
+    if not validate_doi(doi):
+        raise ValueError("Invalid format for PLOS DOI: {}".format(doi))
+    # Slice the prefix off; str.lstrip() strips a character set, not a prefix.
+    prefix = '10.1371/'
+    if doi.startswith(prefix):
+        doi = doi[len(prefix):]
+    return doi.replace('journal.', '')
+
 
 def convert_country(country):
     """