
Commit aa4698f

Merge pull request #55 from georgetown-cset/54-update-sources
Update sources
2 parents 281b7ff + 1cf3af3 commit aa4698f

15 files changed, +21 -149 lines changed

.github/workflows/main.yml

Lines changed: 2 additions & 2 deletions
@@ -9,10 +9,10 @@ jobs:
 
     steps:
     - uses: actions/checkout@v2
-    - name: Set up Python 3.7
+    - name: Set up Python 3.9
      uses: actions/setup-python@v1
      with:
-       python-version: 3.7
+       python-version: 3.9
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip

.github/workflows/pythonapp.yml

Lines changed: 2 additions & 2 deletions
@@ -9,10 +9,10 @@ jobs:
 
     steps:
     - uses: actions/checkout@v2
-    - name: Set up Python 3.7
+    - name: Set up Python 3.9
      uses: actions/setup-python@v1
      with:
-       python-version: 3.7
+       python-version: 3.9
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip

README.md

Lines changed: 4 additions & 5 deletions
@@ -4,7 +4,7 @@
 At CSET, we aim to produce a more comprehensive set of scholarly literature by ingesting multiple sources and then
 deduplicating articles. This repository contains CSET's current method of cross-dataset article linking. Note that we
 use "article" very loosely, although in a way that to our knowledge is fairly consistent across the datasets we draw
-from. Books, for example, are included. We currently include articles from arXiv, Web of Science, Papers With Code,
+from. Books, for example, are included. We currently include articles from arXiv, Papers With Code,
 Semantic Scholar, The Lens, and OpenAlex. Some of these sources are largely duplicative (e.g. arXiv is well covered by
 other corpora) but are included to aid in linking to additional metadata (e.g. arXiv fulltext).
 
@@ -15,12 +15,11 @@ article linkage, see the [ETO documentation](https://eto.tech/dataset-docs/mac/)
 
 To match articles, we need to extract the data that we want to use in matching and put it in a consistent format. The
 SQL queries specified in the `sequences/generate_{dataset}_data.tsv` files are run in the order they appear in those
-files. For OpenAlex we exclude documents with a `type` of Dataset, Peer Review, or Grant. Additionally, we take every
-combination of the Web of Science titles, abstracts, and pubyear so that a match on any of these combinations will
-result in a match on the shared WOS id. Finally, for Semantic Scholar, we exclude any documents that have a non-null
+files. For OpenAlex we exclude documents with a `type` of Dataset, Peer Review, or Grant. Finally, for Semantic Scholar,
+we exclude any documents that have a non-null
 publication type that is one of Dataset, Editorial, LettersAndComments, News, or Review.
 
-For each article in arXiv, Web of Science, Papers With Code, Semantic Scholar, The Lens, and OpenAlex
+For each article in arXiv, Papers With Code, Semantic Scholar, The Lens, and OpenAlex
 we [normalized](utils/clean_corpus.py) titles, abstracts, and author last names to remove whitespace, punctuation,
 and other artifacts thought to not be useful for linking. For the purpose of matching, we filtered out titles,
 abstracts, and DOIs that occurred more than 10 times in the corpus. We then considered each group of articles
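The normalization and frequency filtering the README describes could look roughly like the sketch below. This is an illustrative approximation, not the contents of `utils/clean_corpus.py`; the function names and exact cleaning rules are assumptions.

```python
import re
from collections import Counter
from typing import List, Optional

def normalize(text: str) -> str:
    """Lowercase, drop punctuation, and collapse whitespace (approximate cleaning rules)."""
    text = re.sub(r"[^\w\s]", "", text.lower())
    return re.sub(r"\s+", " ", text).strip()

def filter_frequent(values: List[str], max_count: int = 10) -> List[Optional[str]]:
    """Null out values (titles, abstracts, DOIs) that occur more than max_count times."""
    counts = Counter(values)
    return [v if counts[v] <= max_count else None for v in values]

# Example: two differently formatted titles normalize to the same string before matching.
titles = [normalize(t) for t in ["A Survey of   Article Linkage!", "a survey of article linkage"]]
titles = filter_frequent(titles)
```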

linkage_dag.py

Lines changed: 4 additions & 4 deletions
@@ -46,7 +46,7 @@
 
 production_dataset = "literature"
 staging_dataset = f"staging_{production_dataset}"
-args = get_default_args(pocs=["Jennifer"])
+args = get_default_args(pocs=["James"])
 args["retries"] = 1
 
 with DAG(
@@ -79,7 +79,7 @@
     # standard format
     metadata_sequences_start = []
     metadata_sequences_end = []
-    for dataset in ["arxiv", "wos", "papers_with_code", "openalex", "s2", "lens"]:
+    for dataset in ["arxiv", "papers_with_code", "openalex", "s2", "lens"]:
         ds_commands = []
         query_list = [
             t.strip()
@@ -407,12 +407,12 @@
 
     prep_environment = BashOperator(
        task_id="prep_environment",
-       bash_command=f'gcloud compute ssh jm3312@{gce_resource_id} --zone {GCP_ZONE} --command "{prep_environment_vm_script}"',
+       bash_command=f'gcloud compute ssh airflow@{gce_resource_id} --zone {GCP_ZONE} --command "{prep_environment_vm_script}"',
    )
 
    create_cset_ids = BashOperator(
        task_id="create_cset_ids",
-       bash_command=f'gcloud compute ssh jm3312@{gce_resource_id} --zone {GCP_ZONE} --command "bash run_ids_scripts.sh &> log &"',
+       bash_command=f'gcloud compute ssh airflow@{gce_resource_id} --zone {GCP_ZONE} --command "bash run_ids_scripts.sh &> log &"',
        inlets=[
            BigQueryTable(
                project_id=project_id, dataset_id=production_dataset, table_id="sources"
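For context on the dataset loop changed above: the README notes that the queries in `sequences/generate_{dataset}_data.tsv` run in the order they appear. A minimal sketch of reading one of those sequence files follows; the file layout and helper name are assumptions, not the DAG's exact code.

```python
from pathlib import Path
from typing import List

def load_query_sequence(dataset: str, sequence_dir: str = "sequences") -> List[str]:
    """Return the query names for one dataset, preserving file order."""
    sequence_file = Path(sequence_dir) / f"generate_{dataset}_data.tsv"
    return [line.strip() for line in sequence_file.read_text().splitlines() if line.strip()]

# The updated dataset list no longer includes "wos".
for dataset in ["arxiv", "papers_with_code", "openalex", "s2", "lens"]:
    queries = load_query_sequence(dataset)  # each query would become one sequential task
```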

requirements.txt

Lines changed: 8 additions & 47 deletions
@@ -1,49 +1,10 @@
-apache-beam[gcp]>2.19.0
-attrs==19.3.0
-avro-python3==1.9.2.1
-cachetools==3.1.1
-certifi==2019.11.28
-chardet==3.0.4
-crcmod==1.7
-dill==0.3.1.1
-docopt==0.6.2
-docutils==0.15.2
-fastavro==0.21.24
-fasteners==0.15
-future==0.18.2
-gensim==3.8.1
-google-cloud-bigquery>=1.17.1
-hdfs==2.5.8
-idna==2.9
-importlib-metadata==1.5.0
-jmespath==0.9.5
-mock==2.0.0
-monotonic==1.5
-more-itertools==8.2.0
-packaging==20.1
-pbr==5.4.4
-pluggy==0.13.1
-py>=1.10.0
-pyarrow==0.15.1
-pyasn1==0.4.8
-pyasn1-modules==0.2.8
-pycld2==0.41
-pydot==1.4.1
-pymongo==3.10.1
-pyparsing==2.4.6
-pytest==5.3.5
-python-dateutil==2.8.1
-pytz==2019.3
-requests==2.23.0
-rsa>=4.7
-s3transfer==0.3.3
-scipy==1.4.1
-six==1.14.0
-smart-open==1.9.0
-tqdm==4.43.0
-typing==3.7.4.1
-typing-extensions==3.7.4.1
-wcwidth==0.1.8
-zipp==3.0.0
+apache-beam[gcp]
+chardet
+gensim
+google-cloud-bigquery
+pycld2
+requests
+tqdm
 pre-commit
 coverage
+pytest

sequences/generate_wos_metadata.tsv

Lines changed: 0 additions & 6 deletions
This file was deleted.

sql/ids_to_drop.sql

Lines changed: 1 addition & 1 deletion
@@ -2,4 +2,4 @@ SELECT DISTINCT merged_id
 FROM
   literature.sources
 WHERE
-  orig_id IN (SELECT id1 FROM staging_literature.unlink)
+  orig_id IN (SELECT id1 FROM {{ staging_dataset }}.unlink)
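The change above swaps the hard-coded `staging_literature` dataset for the templated `{{ staging_dataset }}` variable. A minimal sketch of how such a template renders is below, using Jinja (which Airflow applies to templated fields); the rendering context here is illustrative, not the DAG's actual configuration.

```python
from jinja2 import Template

# Render the templated query with a concrete staging dataset name.
sql = Template(
    "SELECT DISTINCT merged_id FROM literature.sources "
    "WHERE orig_id IN (SELECT id1 FROM {{ staging_dataset }}.unlink)"
).render(staging_dataset="staging_literature")
print(sql)  # the staging dataset name is filled in at run time
```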

sql/union_ids.sql

Lines changed: 0 additions & 2 deletions
@@ -1,8 +1,6 @@
 -- glue all the ids together (used in validation)
 SELECT id FROM {{ staging_dataset }}.arxiv_ids
 UNION ALL
-SELECT id FROM {{ staging_dataset }}.wos_ids
-UNION ALL
 SELECT id FROM {{ staging_dataset }}.papers_with_code_ids
 UNION ALL
 SELECT id FROM {{ staging_dataset }}.openalex_ids

sql/union_metadata.sql

Lines changed: 0 additions & 11 deletions
@@ -11,17 +11,6 @@ WITH meta AS (
     "arxiv" AS dataset
   FROM {{ staging_dataset }}.arxiv_metadata
   UNION ALL
-  SELECT
-    cast(id AS STRING) AS id,
-    title,
-    abstract,
-    clean_doi,
-    cast(year AS INT64) AS year,
-    last_names,
-    references,
-    "wos" AS dataset
-  FROM {{ staging_dataset }}.wos_metadata
-  UNION ALL
   SELECT
     cast(id AS STRING) AS id,
     title,

sql/wos_abstracts.sql

Lines changed: 0 additions & 6 deletions
This file was deleted.
