From a3041cc9fd9d94434921bdb1e42bb498c9cd975c Mon Sep 17 00:00:00 2001 From: Spencer Axelrod Date: Fri, 9 Sep 2022 11:12:10 -0500 Subject: [PATCH 1/4] initial commit, based on existing release_automation scripts --- gen3/cli/discovery.py | 9 ++ gen3/tools/metadata/discovery.py | 207 +++++++++++++++++++++++++++++++ 2 files changed, 216 insertions(+) diff --git a/gen3/cli/discovery.py b/gen3/cli/discovery.py index 44d15ca0d..ddae7a7ce 100644 --- a/gen3/cli/discovery.py +++ b/gen3/cli/discovery.py @@ -3,6 +3,7 @@ import click from gen3.tools.metadata.discovery import ( + generate_discovery_metadata, publish_discovery_metadata, output_expanded_discovery_metadata, try_delete_discovery_guid, @@ -94,6 +95,14 @@ def discovery_delete(ctx, guid): try_delete_discovery_guid(auth, guid) +@click.command() +@click.pass_context +def discovery_generate(ctx): + auth = ctx.obj["auth_factory"].get() + endpoint = ctx.obj.get("endpoint") + generate_discovery_metadata(auth, endpoint) + + discovery.add_command(discovery_read, name="read") discovery.add_command(discovery_publish, name="publish") discovery.add_command(discovery_delete, name="delete") diff --git a/gen3/tools/metadata/discovery.py b/gen3/tools/metadata/discovery.py index 951a8b9c0..c6f7f96c3 100644 --- a/gen3/tools/metadata/discovery.py +++ b/gen3/tools/metadata/discovery.py @@ -5,11 +5,13 @@ import tempfile import asyncio import os +from bs4 import BeautifulSoup from urllib.parse import urlparse import requests.exceptions from gen3.metadata import Gen3Metadata +from gen3.submission import Gen3Submission from gen3.tools import metadata from gen3.utils import raise_for_status_and_print_error @@ -25,6 +27,80 @@ logging = get_logger("__name__") +def generate_discovery_metadata(auth, endpoint): + """ + Get discovery metadata from dbgap for currently submitted studies in a commons + """ + submission = Gen3Submission(endpoint, auth_provider=auth) + query_txt = """ +{ + project(first:0) { + project_id + code + name + studies(first:0) { + study_id + dbgap_phs + dbgap_consent + dbgap_version + dbgap_accession + dbgap_consent_text + dbgap_participant_set + authz + full_name + short_name + study_description + _subjects_count + } + } +} + """ + raw_results = submission.query(query_txt).get("data", {}).get("project", []) + results = [] + fields = set() + + for raw_result in raw_results: + studies = raw_result.get("studies") + study_data = {} + if len(studies) != 1: + logging.warning( + f"expect 1:1 project:study, got {studies} from {raw_result}" + ) + else: + study_data = studies[0] + + del raw_result["studies"] + result = copy.deepcopy(raw_result) + result.update(study_data) + + if "authz" in result: + result["authz"] = str(result["authz"]).replace("'", '"') + + result["tags"] = _determine_tags_from_study_info(result) + result["study_description"] = _get_study_description(result) + + # don't include studies with no subjects for now, this effectively removes + # any projects that were created but have no data submitted + if result.get("_subjects_count"): + results.append(result) + + fields = fields | set(result.keys()) + output_filepath = _dbgap_file_from_auth(auth) + + with open(output_filepath, "w+", encoding="utf-8") as output_file: + logging.info(f"writing headers to {output_filepath}: {fields}") + output_writer = csv.DictWriter( + output_file, + delimiter="\t", + fieldnames=fields, + extrasaction="ignore", + ) + output_writer.writeheader() + + for row in results: + output_writer.writerow(row) + + async def output_expanded_discovery_metadata( auth, endpoint=None, limit=500, use_agg_mds=False ): @@ -254,3 +330,134 @@ def _metadata_file_from_auth(auth): return ( "-".join(urlparse(auth.endpoint).netloc.split(".")) + "-discovery_metadata.tsv" ) + + +def _dbgap_file_from_auth(auth): + return "-".join(urlparse(auth.endpoint).netloc.split(".")) + "-dbgap_metadata.tsv" + + +def _determine_tags_from_study_info(study): + tags = [] + if study.get("project_id", "") and study.get("project_id", "").startswith("parent"): + tags.append(_get_tag("Parent", "Program")) + tags.append(_get_tag("DCC Harmonized", "Data Type")) + tags.append(_get_tag("Clinical Phenotype", "Data Type")) + + if study.get("project_id", "") and study.get("project_id", "").startswith("topmed"): + tags.append(_get_tag("TOPMed", "Program")) + tags.append(_get_tag("Genotype", "Data Type")) + + if _is_topmed_study_geno_and_pheno(study.get("code", "")): + tags.append(_get_tag("Clinical Phenotype", "Data Type")) + + if study.get("project_id", "") and study.get("project_id", "").startswith("COVID"): + tags.append(_get_tag("COVID 19", "Program")) + + if study.get("dbgap_accession", "") and study.get("dbgap_accession", "").startswith( + "phs" + ): + tags.append(_get_tag("dbGaP", "Study Registration")) + + return str(tags).replace("'", '"') + + +def _get_tag(name, category): + return {"name": name, "category": category} + + +def _is_topmed_study_geno_and_pheno(study): + # if the topmed study has both gennomic and phenotype data (instead of having a parent + # study with pheno and a topmed with geno separately) + # + # determined from https://docs.google.com/spreadsheets/d/1iVOmZVu_IzsVMdefH-1Rgf8zrjqvnZOUEA2dxS5iRjc/edit#gid=698119570 + # filter to "program"=="topmed" and "parent_study_accession"=="" + return study in [ + "SAGE_DS-LD-IRB-COL", + "Amish_HMB-IRB-MDS", + "CRA_DS-ASTHMA-IRB-MDS-RD", + "VAFAR_HMB-IRB", + "PARTNERS_HMB", + "WGHS_HMB", + "BAGS_GRU-IRB", + "Sarcoidosis_DS-SAR-IRB", + "HyperGEN_GRU-IRB", + "HyperGEN_DS-CVD-IRB-RD", + "THRV_DS-CVD-IRB-COL-NPU-RD", + "miRhythm_GRU", + "AustralianFamilialAF_HMB-NPU-MDS", + "pharmHU_HMB", + "pharmHU_DS-SCD-RD", + "pharmHU_DS-SCD", + "SAPPHIRE_asthma_DS-ASTHMA-IRB-COL", + "REDS-III_Brazil_SCD_GRU-IRB-PUB-NPU", + "Walk_PHaSST_SCD_HMB-IRB-PUB-COL-NPU-MDS-GSO", + "Walk_PHaSST_SCD_DS-SCD-IRB-PUB-COL-NPU-MDS-RD", + "MLOF_HMB-PUB", + "AFLMU_HMB-IRB-PUB-COL-NPU-MDS", + "MPP_HMB-NPU-MDS", + "INSPIRE_AF_DS-MULTIPLE_DISEASES-MDS", + "DECAF_GRU", + "GENAF_HMB-NPU", + "JHU_AF_HMB-NPU-MDS", + "ChildrensHS_GAP_GRU", + "ChildrensHS_IGERA_GRU", + "ChildrensHS_MetaAir_GRU", + "CHIRAH_DS-ASTHMA-IRB-COL", + "EGCUT_GRU", + "IPF_DS-PUL-ILD-IRB-NPU", + "IPF_DS-LD-IRB-NPU", + "IPF_DS-PFIB-IRB-NPU", + "IPF_HMB-IRB-NPU", + "IPF_DS-ILD-IRB-NPU", + "OMG_SCD_DS-SCD-IRB-PUB-COL-MDS-RD", + "BioVU_AF_HMB-GSO", + "LTRC_HMB-MDS", + "PUSH_SCD_DS-SCD-IRB-PUB-COL", + "GGAF_GRU", + "PIMA_DS-ASTHMA-IRB-COL", + "CARE_BADGER_DS-ASTHMA-IRB-COL", + "CARE_TREXA_DS-ASTHMA-IRB-COL", + ] + + +def _get_study_description(study): + dbgap_phs = study.get("dbgap_phs", "") or "" + dbgap_version = study.get("dbgap_version", "") or "" + dbgap_participant_set = study.get("dbgap_participant_set", "") or "" + dbgap_study = f"{dbgap_phs}.{dbgap_version}.{dbgap_participant_set}" + + study_description = study.get("study_description") + if dbgap_study != "..": + DBGAP_WEBSITE = ( + "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=" + ) + url = DBGAP_WEBSITE + dbgap_study + + logging.debug(f"scraping {url}") + page = requests.get(url) + soup = BeautifulSoup(page.content, "html.parser") + + report = soup.find("dl", class_="report") + if report: + study_description_start = report.find("dt") + + # sometimes the study description isn't the first "dd" tag + if "Study Description" not in study_description_start.getText(): + study_description_start = study_description_start.find_next_sibling( + "dt" + ) + + study_description = study_description_start.find_next_sibling("dd") or "" + + if study_description: + links = study_description.find(id="important-links") + if links: + links.decompose() + + study_description = ( + study_description.getText().strip().replace("\t", " ") + + f"\n\nNOTE: This text was scraped from https://www.ncbi.nlm.nih.gov/ on {date.today()} and may not include exact formatting or images." + ) + logging.debug(f"{study_description}") + + return study_description From 481b06449055433bb686f17f43b0a89d3a0734d3 Mon Sep 17 00:00:00 2001 From: SpencerAxelrod Date: Fri, 9 Sep 2022 16:22:37 +0000 Subject: [PATCH 2/4] Apply automatic documentation changes --- docs/_build/doctrees/environment.pickle | Bin 317584 -> 317584 bytes docs/_build/doctrees/tools/indexing.doctree | Bin 97019 -> 97019 bytes docs/_build/doctrees/tools/metadata.doctree | Bin 35856 -> 35846 bytes docs/_build/html/searchindex.js | 2 +- docs/_build/html/tools/indexing.html | 2 +- docs/_build/html/tools/metadata.html | 2 +- 6 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/_build/doctrees/environment.pickle b/docs/_build/doctrees/environment.pickle index 9af6dcf6898e658196bbb3e916882c2419bfbacd..27507b35f8a881caec204fe07f09df53e7db1d1d 100644 GIT binary patch delta 5708 zcmbtYc~F#Bw)cMBX%?l~rCFM_!zMI~h#MdTkX=A<8$|@^5I);RX*y0u31%|oOg}#> z=;@a)l}Su8GnvVIB@t|tU{G-DsF78c*0_MlOftzVI`!)Pk(b1G?)|#Kq%u`gHUFG@ zmftz|o_o%{xBKhp_U-8Q?b1Y+$ob}aqeW1*HaF{yNxOBebq#f`br!+3$*fra?;FVzkkOe$)ovMEkPc zg~u@wuqMt`*1-?KS22Mw9peqMST)tTb+9}!0PKzNgQKyrphD~A7$4x`ePMr$JESbh zfYWh4a3#hY(KK@>dfws#`dByEpW=oHx*HUc9G~!n?~{nzPax70L;eYhZk#jD4^c~# z;-DqP2WO)jDxm`59|?INkM#wLen%S!X(>yfUYh^|ss4}^>jR-l{$Ncx4A+x=lwQuQ zG_^MAjW8G!0P{gc2PSy~i$Whfwb*=Bed`SZ(aAYyqGWTn?Z?20HTDh`9$^lbPI>L^ZGs)G9E z+3;3+6xhn5;0xq~kuO=XnM|@-Y%6wvW)*s_+y+6*qTs;R5cq6mHvAT&scSW`YgHvX zko+661q)aw^QS3C`#O=A1Qw#IH1a{M`5wGUWx{{lNQ4Xmub`8!}~mIlOp8>rW) zp)GS0TwWFcU#u+zw`>hm7uewS5(BnH1pHTFAs9+E;9q2e;PP~+C|rSU(ga~ZcY|$3IB2R0;rR>=bX3{kn`>Ki^`<6$gLDAU zi1!goV(yt~cZhV-c_YniUZM%kOEk}UiKaU*(X8hsn*6-vxDFlj(KS@1}Y zwn5a$Y$m~@QQusT+3aj>(c!^xhU9Ns>+1Oj~dHc6+*qLSKOfqRbZ2q|)nX)R6 zpLHk0)@$#@%8A|jj}w28mD4sbvo0{R1k9`l%&Y~>tOLxf0nDuaW>$VPTfdo=$INPP zW|^6>@?Uh(>pQEY9b3g)*R9uKG>^sE<Wl>2&pd-p?=bq=nH9*Z zd-Ejp9wEf6h-s`2yd)T`MY4QiT67~ReUQof^K0h(3leoZ%4_qaE zJ7z1zK)v@|jPE|kLhsDJOr-n!@{kVpWg{hLb0oi`aIh}{yjm8)!Lu%C_|;iD-PN;e zB=hTYxg9jRK*on2Xe|DaW^&}iR5ApE9~L5Y>(5lMOZLA(*&hvE{c`dAG;xI&`xTPD z)1mi4@TLANNoi%u#B)@ceJ%rO+qqR_3_dvb4ATEQSE68d_N8Oz{ULoI1V;9GI19rJ zHe9+2PM(jEaMy6pRgf{T7=a4#8c<@8uz>=}ID&a-;q?KfB#uMQpjNz_zy&yVY7D*} z$U%pU!3v~D2D6af7+i-m_`+JG^%wGyzIP#8^8b>B3%N+=SaLBP4@mvRT%^Chm_{bx zlZ!b>)t6Q&*i-Wzgk#B6?UBxAn1Ht~WszyPcc~QVlFMZ->@`~GcI{)a9&0KTjm%|n`bInC;Z{`$k^Eu{RkH`B( ze?1$Yu+HnHNWZ@REPc*C-lAX^_eO}gktaSrEc1rG?u8J3V;frA7`=U?9QpX64M-0T z)gZk$v{mxj1&{3hP(7TE%EQAOCFNeG{ATzWRIV9u#NWr1%F#3*W7E%c&V{k5f95dJ zwUv+3_?@FQQmB`iX=Wl4I*xik_1I!beTb>=k0n9Rn3tq7J5+1XJ#2gn(ig_7kq(a+ zAywU6FTq}8usb)CkG=Mkvy^NRzjhchB;0Wn@gK}YB0BG$*eof3$1pP!`7X@m4{p%i z697}<)9~EnA_STyHz6IGtd`(iPlYCyD(` z?3~t;DY!FTreHrrF0uLFwa36NdzhrMIaF&A_ZNE=((GG}l4*p6xNvI;D*t+`SW-?h z{F}EmJ_Q?!@|(S}aou42ry_=8p*xeGO_wH^-nt5+0(s%CVlVSM$-Ug(Z z_n&iNKdjx{ft>e)!Takad66V1&g4n5rz8tAB?^Z3f$Uj7cr;UlK~~SMAa-b-U4``I z>>8whpUqXUAL)_M{UCm!e(p}N!|n%J2srUzCDK1V$Z}yYtvh)3qh zTZ;R!!2me;sRkY1{d6@Rw%JeB*bR@7PJ#L}N>?+w1?jML-)BFm2&V73;8B2ouXFGN zgm$N?MQ>q0!Gu6K-W{x9_DEy{VL<2ZBAL{XbSjLD_ikE7(!9y2D6A#H?1dIWA)+yi z<448ewIl*9579zKdlwZDBTq)egWW{KULhd}hI@is@y7|BKMsWuG3aBNhf@=Unx`dX z7#R3){HZAxpTdwVhCiG??TPHsQ$&r!$x|ds!Un)gX9DFkrdZj9GmnVgRw6y%m``?Z zuw?st7YRjMla)l_aKwtMXX{Xj5uLtSoTg+nBpXffUJBKkQWb_ulA$Ys3qtAR1TKy}hZrb)&>uBkiCkO=eVYX@ zy5=&+*DP(Lu2uS2Bwmfl&(B>sn1X>!i3>kogwT zV1mEXSXZytX&W3!#89K}_l$e?v@{v8{>R%fKLI3&m-ar%dX%uiaP*Dqv2 zAyNou$l8V$W3A&mC%R-Ekv;8P>cEs*#lW8WCS&UlfX9!>BFJR#;1SvAIx^LJ^p0#_ zbs&8=1g{32UWbRMPOqaEBiaZ2iSt)W@?LuKj-2KY8{NU{^UcKMw<=4yE z1&zfWI-%zH;oh(M<*x)0ySRT^9>^|b0c$?1lGq^ddJqyw?^)>}Y%*50Hn-F@>a_IE zwu^tAmWPD?n7L&)nyH&Vw4Ypze|E(+c6l&Pv(+vSTTCq`!L<=PTuZ;~wpvW4dhP!M D_0Byr delta 5690 zcmbU_X;hR~mcFklRe-W@#V$}95Ga8tF6=N%}ZlDc$ZR|QEQ`6j)*>litTX5liS^Yci}cLPP5OpcEs{rL zeZ9f7e80Z2s-~*33I&W|k=#s8)%)vArn-6~8dWtlzGRUU2Tf?Is;?IHdmEuXO6%(4 zG_-a`B`I3Qg#`=pvh5ciU;pFS3*yim_2)a|;=mmBP3fpO=%o6E?dQbdIZCx>TAZGv zJdeAIlXH}JibfonqXL5r;@BLuLdX$s=ggBAWzjxI<^1kjacYiwA^Cf8bdK6HvsIj! zqxOcS!pBQf)ijZJEHRMt>=Mw%1iQAf@F^!aMjP15!beK-)ajPj>W$AWrv-wIIt?s| z^+q{O=;`ubCHYVt^hndB9zRO(g-5ZjusGfgB|Z=k_bf_j+E16JqLh|_N-LB7fO`C? zJ=LJ4nb%kb&9S}??^JsGA4=1}q64^yIGFg*;`t;m93uZ)JHnJZu#*C*FRgIAgg)~(zQiWRYNb>%jQSse`- zDOK>>RnbsT7y+{>SrC%K^*o9-P`xS(B2i{5iH5Im{Q=5~R&NLGTI8*+gtu3q-I`rw zlCwp3Bfed7ceIN>gBC{zve2sTTYTY_Pq=h~2M+qzzgaE3`pyzL8Ce zM{y;5m#>9m#WpxzWW*`h2gYJ8Y}sIgI|V7MI7O>9@Y~XSXfM=)$Hw-28keNj8yg$y zcuE^sduZ0QITX{@JUx5XNY=wqkg^Y2o6CX-oZf>wuU-t=&G}%)scPM9gRsmnXig8p z2;K+REUmRz)C0eb8=AC=tl{BqY zxyzH_!rQ9deT2Z)a$1?yIK2j_*y;@}#lFy1ad#JOOA%p>K}6@bNMPMZiuMQ*?Ghr| zCq%SUh-l*x(V`%t=^>&C7AaX7(8~n;SLM1E>hhoN|6KdgNY9*8(;pR&@|z?tHdYml z^%V{J>ZUqF4Ig|I;x)vQXk{58)#4!O(9H{JX7dzHaGs)h&QmnqdCGC$JABgQ=Vf&C z=P4R9rP8QUE2|78Ipw&9In^Y&*Ba`pF`K=O4SF0cr=B=qt*#~l8MoHdOb{|;O?|PS zxbS+4gVu#F{p&(9Z2dRWd4Wt@%U+pKkx7VY?j$4DndTZo#;nh`xVe$rP-0FecIy%I zTYhB9>T%+{8yU6U_&7#MCanK^>i3FLIs#@^FEdNb%=&L;?KiXTn_2VCtoLTtdNYgN z%vxh+*_m0}&Diy?+Ub>@+mV9s^^SE2`#Opdx}V8IxaCYTUgf_$ldht7jhG6pXEjh; z>kg03grmUInuD<1nt>3kE65=9Tk{Zlb>^zs#Zl@4Eu(?(N?oG6xZZ8Sf*5|)Zi10M z4fJ;=q3+|(LWH?za}gdno3CPad*Dg02ExyK0XgT#Syj;16$AbTUl*q**mLfAH2R3a zFV3yTby!yp*FFG+XJg>Nd8PcMop`7nd&9?FG`vS$nFtdhqg-^nCbSDm$wl$ zochFt^II@m?gRPWb3T6fBm=F}yVDWA)}4c}uR9ANIiJn#-i1Tm@!-|42o9ZBqu`h4 z>303+Gr9QqTyBR#?(eV17Fo z+OPVm7|R1j`je4d=w`AwCTR@3u9cwK9tKZcE9RQPka#s25^Vw8SD3@sQfI}OyT#CP zEg9ahsbSdWfj;ir$`B@9--PfN*UPx4c-(W1Y`i3>9km&QsQxXe@w5KT2ygWlalO?n zwnzWs?%|A#ojO-|vpW#Zy%{XeTo=^te3;qp8FTqPxZ%hzn+5dXhFWf^7gVnEIcDEn zhWCr%W)`tS+s$Hxk8i$+u|tcvq>fC%-N_Oa`zX1>_CINV7WUaAILqc>tw+5-+czW3nyTfZA?D)JR2(wDnJVPW zai;(NwALqYOF@pa4a~dUAxcF3!RZ37e{W7ddtLG7pXkprQS%+D|A#w8T>neP^t-!~ zlV2mb44$0y1nH!Q^Tj&RGC_pEvv#ZGjG|sL? zczQMy;kUEvRP5tC3OXJxTWFZO7bal;!wgh7^>7Ws|9+UEW@&@rh@ZUi4@8Chs!rm|o@EseQ;ujOkzCOiBD?+h z(g!NL{ZKorlPu;&p@`n-WRd-bGcRvd%O^g)MxQREPE2s(c(UTFy@d_+F9Rfr-ZUH`sSIAkPa za->c0lUpwdUO!s9$&2-ZQehFtWaBj<5+$wI(1hs=g7W(|K_mb0p5TE~!PFA33QIXZ1`+Pz2Zs);*^A4qm%z)1 zm#Uev6piZ;x&5l(Hm|!-iKpISiSI9s<;Ha(gc}FL$c<1QPy1!;;aFE(zi{A+l3xFE zrCG6z*yWo?6cLELC51nGLiu_ryT+vu*myKl!@j>IkDU60Mnip7&F03c#wOg=F25`( zd<&>E$=@lgsy66#H4aVGp+euE2oD@+s54^!naHK2%2@ZJ-b876KvG2E_Kzfm9}m&% z7mDCe?!xJ^qNc%A!T+_SCK*Q+OPrzhPK0TgvZtZW)cALbCypv2iM^}usA6~nnd*A? zuHs;M5Pdf!uNu8UkB6wrV5q2Vs;j}wrsT*ZWhkm;Bqm)`l&N>H2&|<|YQFrbc?^W+p}k#@kEv8GTsM1<#u>KH^3boW9?I(G^`x Q$dSBFVBjZ9#05fzQfB*mh delta 114 zcmezUmG$>m)`l&N>H2&IX69xVmgaf}mIjt)mfK798GTsM1<#u>KH^3boW9?I(G^`x Q$dSBFVBjZ9#08t1Y5dZ)H diff --git a/docs/_build/doctrees/tools/metadata.doctree b/docs/_build/doctrees/tools/metadata.doctree index c72ba6d651bcae91d42945d82fc82a0a1e0bb8eb..1608b2a6b9773548511e56bd047039df0f58161b 100644 GIT binary patch delta 165 zcmbO*gQ;x>6KezO)Zg73S+A?J%w}X@n7mO(Zu38NTSi_Za}xtoQ)4|NBSRz8$#eB= zHn(XWXJ)M3?5FF1CcAmAUMmNq+vE%8CPFCM^m6jk)h9nR5r#-gV@giRV2_=WA=U%d MjB4j*VT-@P01*)|Q2+n{ delta 163 zcmZph!8Bn86KezORMwu2tk=~U=Wf2M9>>UUU}kP+VQH>sVrXt`WMDjbrk>5_7R}?# zjP;v+bR96{H_y~-Indexing Tools
-async gen3.tools.indexing.verify_manifest.async_verify_object_manifest(commons_url, manifest_file, max_concurrent_requests=24, manifest_row_parsers={'acl': <function _get_acl_from_row>, 'authz': <function _get_authz_from_row>, 'file_name': <function _get_file_name_from_row>, 'file_size': <function _get_file_size_from_row>, 'guid': <function _get_guid_from_row>, 'md5': <function _get_md5_from_row>, 'urls': <function _get_urls_from_row>}, manifest_file_delimiter=None, output_filename='verify-manifest-errors-1660676897.0909693.log')[source]
+async gen3.tools.indexing.verify_manifest.async_verify_object_manifest(commons_url, manifest_file, max_concurrent_requests=24, manifest_row_parsers={'acl': <function _get_acl_from_row>, 'authz': <function _get_authz_from_row>, 'file_name': <function _get_file_name_from_row>, 'file_size': <function _get_file_size_from_row>, 'guid': <function _get_guid_from_row>, 'md5': <function _get_md5_from_row>, 'urls': <function _get_urls_from_row>}, manifest_file_delimiter=None, output_filename='verify-manifest-errors-1662740552.7642033.log')[source]

Verify all file object records into a manifest csv

Parameters:
diff --git a/docs/_build/html/tools/metadata.html b/docs/_build/html/tools/metadata.html index 7aa473121..eb3fa8d53 100644 --- a/docs/_build/html/tools/metadata.html +++ b/docs/_build/html/tools/metadata.html @@ -104,7 +104,7 @@

Metadata Tools
-async gen3.tools.metadata.ingest_manifest.async_ingest_metadata_manifest(commons_url, manifest_file, metadata_source, auth=None, max_concurrent_requests=24, manifest_row_parsers={'guid_for_row': <function _get_guid_for_row>, 'indexed_file_object_guid': <function _query_for_associated_indexd_record_guid>}, manifest_file_delimiter=None, output_filename='ingest-metadata-manifest-errors-1660676897.4173203.log', get_guid_from_file=True, metadata_type=None)[source]
+async gen3.tools.metadata.ingest_manifest.async_ingest_metadata_manifest(commons_url, manifest_file, metadata_source, auth=None, max_concurrent_requests=24, manifest_row_parsers={'guid_for_row': <function _get_guid_for_row>, 'indexed_file_object_guid': <function _query_for_associated_indexd_record_guid>}, manifest_file_delimiter=None, output_filename='ingest-metadata-manifest-errors-1662740553.22125.log', get_guid_from_file=True, metadata_type=None)[source]

Ingest all metadata records into a manifest csv

Parameters:
From d3e3d10dde4c1ebe531d3c299e7d78b8b85a4117 Mon Sep 17 00:00:00 2001 From: Spencer Axelrod Date: Tue, 13 Sep 2022 18:55:49 -0500 Subject: [PATCH 3/4] fix errors, update dependancies --- gen3/cli/discovery.py | 9 +- gen3/tools/metadata/discovery.py | 10 +- poetry.lock | 278 +++++++++++++++---------------- pyproject.toml | 1 + 4 files changed, 151 insertions(+), 147 deletions(-) diff --git a/gen3/cli/discovery.py b/gen3/cli/discovery.py index ddae7a7ce..52de95c59 100644 --- a/gen3/cli/discovery.py +++ b/gen3/cli/discovery.py @@ -98,11 +98,18 @@ def discovery_delete(ctx, guid): @click.command() @click.pass_context def discovery_generate(ctx): + """ + Generate discovery metadata from dbgap + """ + print("get auth") auth = ctx.obj["auth_factory"].get() + print("get endpoint") endpoint = ctx.obj.get("endpoint") - generate_discovery_metadata(auth, endpoint) + print(f"generate_discovery_metadata() for {endpoint}") + generate_discovery_metadata(auth, endpoint=endpoint) discovery.add_command(discovery_read, name="read") discovery.add_command(discovery_publish, name="publish") discovery.add_command(discovery_delete, name="delete") +discovery.add_command(discovery_generate, name="generate") diff --git a/gen3/tools/metadata/discovery.py b/gen3/tools/metadata/discovery.py index c6f7f96c3..151c38445 100644 --- a/gen3/tools/metadata/discovery.py +++ b/gen3/tools/metadata/discovery.py @@ -5,9 +5,10 @@ import tempfile import asyncio import os +import copy from bs4 import BeautifulSoup from urllib.parse import urlparse - +from datetime import date import requests.exceptions from gen3.metadata import Gen3Metadata @@ -27,10 +28,11 @@ logging = get_logger("__name__") -def generate_discovery_metadata(auth, endpoint): +def generate_discovery_metadata(auth, endpoint=None): """ Get discovery metadata from dbgap for currently submitted studies in a commons """ + print(f"getting currently submitted project/study data from {endpoint}...") submission = Gen3Submission(endpoint, auth_provider=auth) query_txt = """ { @@ -59,6 +61,7 @@ def generate_discovery_metadata(auth, endpoint): results = [] fields = set() + print(f"parsing {endpoint} submission query...") for raw_result in raw_results: studies = raw_result.get("studies") study_data = {} @@ -86,7 +89,7 @@ def generate_discovery_metadata(auth, endpoint): fields = fields | set(result.keys()) output_filepath = _dbgap_file_from_auth(auth) - + print(f"Writing to {output_filepath}...") with open(output_filepath, "w+", encoding="utf-8") as output_file: logging.info(f"writing headers to {output_filepath}: {fields}") output_writer = csv.DictWriter( @@ -425,6 +428,7 @@ def _get_study_description(study): dbgap_version = study.get("dbgap_version", "") or "" dbgap_participant_set = study.get("dbgap_participant_set", "") or "" dbgap_study = f"{dbgap_phs}.{dbgap_version}.{dbgap_participant_set}" + print(f"Getting study description for {dbgap_study}...") study_description = study.get("study_description") if dbgap_study != "..": diff --git a/poetry.lock b/poetry.lock index 4d3e1b9e4..cd5fc62ef 100644 --- a/poetry.lock +++ b/poetry.lock @@ -24,7 +24,7 @@ multidict = ">=4.5,<7.0" yarl = ">=1.0,<2.0" [package.extras] -speedups = ["cchardet", "brotli", "aiodns"] +speedups = ["aiodns", "brotli", "cchardet"] [[package]] name = "aiosignal" @@ -87,10 +87,10 @@ optional = false python-versions = ">=3.5" [package.extras] -tests_no_zope = ["cloudpickle", "pytest-mypy-plugins", "mypy (>=0.900,!=0.940)", "pytest (>=4.3.0)", "pympler", "hypothesis", "coverage[toml] (>=5.0.2)"] -tests = ["cloudpickle", "zope.interface", "pytest-mypy-plugins", "mypy (>=0.900,!=0.940)", "pytest (>=4.3.0)", "pympler", "hypothesis", "coverage[toml] (>=5.0.2)"] -docs = ["sphinx-notfound-page", "zope.interface", "sphinx", "furo"] -dev = ["cloudpickle", "pre-commit", "sphinx-notfound-page", "sphinx", "furo", "zope.interface", "pytest-mypy-plugins", "mypy (>=0.900,!=0.940)", "pytest (>=4.3.0)", "pympler", "hypothesis", "coverage[toml] (>=5.0.2)"] +dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "mypy (>=0.900,!=0.940)", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit", "cloudpickle"] +docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] +tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "mypy (>=0.900,!=0.940)", "pytest-mypy-plugins", "zope.interface", "cloudpickle"] +tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "mypy (>=0.900,!=0.940)", "pytest-mypy-plugins", "cloudpickle"] [[package]] name = "authlib" @@ -132,6 +132,32 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "beautifulsoup4" +version = "4.11.1" +description = "Screen-scraping library" +category = "main" +optional = false +python-versions = ">=3.6.0" + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +html5lib = ["html5lib"] +lxml = ["lxml"] + +[[package]] +name = "bs4" +version = "0.0.1" +description = "Dummy package for Beautiful Soup" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +beautifulsoup4 = "*" + [[package]] name = "cached-property" version = "1.5.2" @@ -179,7 +205,7 @@ resolved_reference = "bdfdeb05e45407e839fd954ce6d195d847cd8024" [[package]] name = "certifi" -version = "2022.6.15" +version = "2022.6.15.2" description = "Python package for providing Mozilla's CA Bundle." category = "main" optional = false @@ -198,7 +224,7 @@ pycparser = "*" [[package]] name = "charset-normalizer" -version = "2.1.0" +version = "2.1.1" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." category = "main" optional = false @@ -225,7 +251,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [[package]] name = "coverage" -version = "6.4.3" +version = "6.4.4" description = "Code coverage measurement for Python" category = "dev" optional = false @@ -239,7 +265,7 @@ toml = ["tomli"] [[package]] name = "cryptography" -version = "37.0.4" +version = "38.0.1" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." category = "dev" optional = false @@ -252,7 +278,7 @@ cffi = ">=1.12" docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"] docstest = ["pyenchant (>=1.6.11)", "twine (>=1.12.0)", "sphinxcontrib-spelling (>=4.0.1)"] pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] -sdist = ["setuptools_rust (>=0.11.4)"] +sdist = ["setuptools-rust (>=0.11.4)"] ssh = ["bcrypt (>=3.1.5)"] test = ["pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-subtests", "pytest-xdist", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,!=3.79.2)"] @@ -270,7 +296,7 @@ marshmallow-enum = ">=1.5.1,<2.0.0" typing-inspect = ">=0.4.0" [package.extras] -dev = ["types-dataclasses", "simplejson", "flake8", "portray", "hypothesis", "mypy (>=0.710)", "ipython", "pytest (>=6.2.3)"] +dev = ["pytest (>=6.2.3)", "ipython", "mypy (>=0.710)", "hypothesis", "portray", "flake8", "simplejson", "types-dataclasses"] [[package]] name = "dictionaryutils" @@ -338,7 +364,7 @@ requests = ">=2.23.0,<3.0.0" [[package]] name = "fastavro" -version = "1.5.4" +version = "1.6.1" description = "Fast read/write of AVRO files" category = "main" optional = false @@ -584,7 +610,7 @@ python-versions = ">=3.7" [[package]] name = "marshmallow" -version = "3.17.0" +version = "3.17.1" description = "A lightweight library for converting complex datatypes to and from native Python datatypes." category = "main" optional = false @@ -594,9 +620,9 @@ python-versions = ">=3.7" packaging = ">=17.0" [package.extras] -dev = ["pytest", "pytz", "simplejson", "mypy (==0.961)", "flake8 (==4.0.1)", "flake8-bugbear (==22.6.22)", "pre-commit (>=2.4,<3.0)", "tox"] -docs = ["sphinx (==4.5.0)", "sphinx-issues (==3.0.1)", "alabaster (==0.7.12)", "sphinx-version-warning (==1.1.2)", "autodocsumm (==0.2.8)"] -lint = ["mypy (==0.961)", "flake8 (==4.0.1)", "flake8-bugbear (==22.6.22)", "pre-commit (>=2.4,<3.0)"] +dev = ["pytest", "pytz", "simplejson", "mypy (==0.971)", "flake8 (==5.0.4)", "flake8-bugbear (==22.8.22)", "pre-commit (>=2.4,<3.0)", "tox"] +docs = ["sphinx (==5.1.1)", "sphinx-issues (==3.0.1)", "alabaster (==0.7.12)", "sphinx-version-warning (==1.1.2)", "autodocsumm (==0.2.9)"] +lint = ["mypy (==0.971)", "flake8 (==5.0.4)", "flake8-bugbear (==22.8.22)", "pre-commit (>=2.4,<3.0)"] tests = ["pytest", "pytz", "simplejson"] [[package]] @@ -628,7 +654,7 @@ python-versions = "*" [[package]] name = "numpy" -version = "1.23.1" +version = "1.23.3" description = "NumPy is the fundamental package for array computing with Python." category = "main" optional = false @@ -647,7 +673,7 @@ pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" [[package]] name = "pandas" -version = "1.4.3" +version = "1.4.4" description = "Powerful data structures for data analysis, time series, and statistics" category = "main" optional = false @@ -655,16 +681,16 @@ python-versions = ">=3.8" [package.dependencies] numpy = [ - {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, - {version = ">=1.20.0", markers = "platform_machine == \"arm64\" and python_version < \"3.10\""}, - {version = ">=1.19.2", markers = "platform_machine == \"aarch64\" and python_version < \"3.10\""}, {version = ">=1.18.5", markers = "platform_machine != \"aarch64\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, + {version = ">=1.19.2", markers = "platform_machine == \"aarch64\" and python_version < \"3.10\""}, + {version = ">=1.20.0", markers = "platform_machine == \"arm64\" and python_version < \"3.10\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, ] python-dateutil = ">=2.8.1" pytz = ">=2020.1" [package.extras] -test = ["pytest-xdist (>=1.31)", "pytest (>=6.0)", "hypothesis (>=5.5.3)"] +test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] [[package]] name = "pluggy" @@ -819,7 +845,7 @@ python-versions = ">=2.7" [[package]] name = "pytz" -version = "2022.1" +version = "2022.2.1" description = "World timezone definitions, modern and historical" category = "main" optional = false @@ -853,7 +879,7 @@ use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "requests-mock" -version = "1.9.3" +version = "1.10.0" description = "Mock out responses from the requests package" category = "dev" optional = false @@ -865,7 +891,7 @@ six = "*" [package.extras] fixture = ["fixtures"] -test = ["fixtures", "mock", "purl", "pytest", "sphinx", "testrepository (>=0.0.18)", "testtools"] +test = ["fixtures", "mock", "purl", "pytest", "sphinx", "testrepository (>=0.0.18)", "testtools", "requests-futures"] [[package]] name = "rfc3986" @@ -891,11 +917,19 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" [[package]] name = "sniffio" -version = "1.2.0" +version = "1.3.0" description = "Sniff out which async library your code is running under" category = "main" optional = false -python-versions = ">=3.5" +python-versions = ">=3.7" + +[[package]] +name = "soupsieve" +version = "2.3.2.post1" +description = "A modern CSS selector implementation for Beautiful Soup." +category = "main" +optional = false +python-versions = ">=3.6" [[package]] name = "sqlalchemy" @@ -930,18 +964,18 @@ six = "*" SQLAlchemy = ">=1.0" [package.extras] -url = ["furl (>=0.4.1)"] -timezone = ["python-dateutil"] -test_all = ["backports.zoneinfo", "pytz (>=2014.2)", "python-dateutil (>=2.6)", "python-dateutil", "pytest (>=2.7.1)", "pyodbc", "pymysql", "psycopg2cffi (>=2.8.1)", "psycopg2 (>=2.5.1)", "phonenumbers (>=5.9.2)", "pg8000 (>=1.12.4)", "pendulum (>=2.0.5)", "passlib (>=1.6,<2.0)", "mock (==2.0.0)", "isort (>=4.2.2)", "intervals (>=0.7.1)", "furl (>=0.4.1)", "flexmock (>=0.9.7)", "flake8 (>=2.4.0)", "docutils (>=0.10)", "cryptography (>=0.6)", "colour (>=0.0.4)", "arrow (>=0.3.4)", "Pygments (>=1.2)", "Jinja2 (>=2.3)", "Babel (>=1.3)"] -test = ["backports.zoneinfo", "pyodbc", "isort (>=4.2.2)", "flake8 (>=2.4.0)", "pymysql", "python-dateutil (>=2.6)", "pytz (>=2014.2)", "pg8000 (>=1.12.4)", "psycopg2cffi (>=2.8.1)", "psycopg2 (>=2.5.1)", "mock (==2.0.0)", "flexmock (>=0.9.7)", "docutils (>=0.10)", "Jinja2 (>=2.3)", "Pygments (>=1.2)", "pytest (>=2.7.1)"] -phone = ["phonenumbers (>=5.9.2)"] -pendulum = ["pendulum (>=2.0.5)"] -password = ["passlib (>=1.6,<2.0)"] -intervals = ["intervals (>=0.7.1)"] -encrypted = ["cryptography (>=0.6)"] -color = ["colour (>=0.0.4)"] -babel = ["Babel (>=1.3)"] arrow = ["arrow (>=0.3.4)"] +babel = ["Babel (>=1.3)"] +color = ["colour (>=0.0.4)"] +encrypted = ["cryptography (>=0.6)"] +intervals = ["intervals (>=0.7.1)"] +password = ["passlib (>=1.6,<2.0)"] +pendulum = ["pendulum (>=2.0.5)"] +phone = ["phonenumbers (>=5.9.2)"] +test = ["pytest (>=2.7.1)", "Pygments (>=1.2)", "Jinja2 (>=2.3)", "docutils (>=0.10)", "flexmock (>=0.9.7)", "mock (==2.0.0)", "psycopg2 (>=2.5.1)", "psycopg2cffi (>=2.8.1)", "pg8000 (>=1.12.4)", "pytz (>=2014.2)", "python-dateutil (>=2.6)", "pymysql", "flake8 (>=2.4.0)", "isort (>=4.2.2)", "pyodbc", "backports.zoneinfo"] +test_all = ["Babel (>=1.3)", "Jinja2 (>=2.3)", "Pygments (>=1.2)", "arrow (>=0.3.4)", "colour (>=0.0.4)", "cryptography (>=0.6)", "docutils (>=0.10)", "flake8 (>=2.4.0)", "flexmock (>=0.9.7)", "furl (>=0.4.1)", "intervals (>=0.7.1)", "isort (>=4.2.2)", "mock (==2.0.0)", "passlib (>=1.6,<2.0)", "pendulum (>=2.0.5)", "pg8000 (>=1.12.4)", "phonenumbers (>=5.9.2)", "psycopg2 (>=2.5.1)", "psycopg2cffi (>=2.8.1)", "pymysql", "pyodbc", "pytest (>=2.7.1)", "python-dateutil", "python-dateutil (>=2.6)", "pytz (>=2014.2)", "backports.zoneinfo"] +timezone = ["python-dateutil"] +url = ["furl (>=0.4.1)"] [[package]] name = "toml" @@ -961,7 +995,7 @@ python-versions = ">=3.7" [[package]] name = "tqdm" -version = "4.64.0" +version = "4.64.1" description = "Fast, Extensible Progress Meter" category = "main" optional = false @@ -971,10 +1005,10 @@ python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" colorama = {version = "*", markers = "platform_system == \"Windows\""} [package.extras] -telegram = ["requests"] -slack = ["slack-sdk"] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] notebook = ["ipywidgets (>=6)"] -dev = ["wheel", "twine", "py-make (>=0.1.0)"] +slack = ["slack-sdk"] +telegram = ["requests"] [[package]] name = "typing-extensions" @@ -986,7 +1020,7 @@ python-versions = ">=3.7" [[package]] name = "typing-inspect" -version = "0.7.1" +version = "0.8.0" description = "Runtime inspection utilities for typing module." category = "main" optional = false @@ -998,16 +1032,16 @@ typing-extensions = ">=3.7.4" [[package]] name = "urllib3" -version = "1.26.11" +version = "1.26.12" description = "HTTP library with thread-safe connection pooling, file post, and more." category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" [package.extras] +brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] +secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "urllib3-secure-extra", "ipaddress"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] -secure = ["ipaddress", "certifi", "idna (>=2.0.0)", "cryptography (>=1.3.4)", "pyOpenSSL (>=0.14)"] -brotli = ["brotlipy (>=0.6.0)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] [[package]] name = "werkzeug" @@ -1046,16 +1080,13 @@ multidict = ">=4.0" [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "fd430f1365a6a395b3eeb90163c892925cfd21d22144862c2c612920fa2a58af" +content-hash = "b97d22ee4ba622cfcdc8e8941e9a61bd443d77eb93edf166fff893a8560f03a3" [metadata.files] aiofiles = [] aiohttp = [] aiosignal = [] -anyio = [ - {file = "anyio-3.6.1-py3-none-any.whl", hash = "sha256:cb29b9c70620506a9a8f87a309591713446953302d7d995344d0d7c6c0c9a7be"}, - {file = "anyio-3.6.1.tar.gz", hash = "sha256:413adf95f93886e442aea925f3ee43baa5a765a64a0f52c6081894f9992fdd0b"}, -] +anyio = [] async-timeout = [] asyncio = [] atomicwrites = [] @@ -1064,14 +1095,13 @@ authlib = [ {file = "Authlib-0.11-py2.py3-none-any.whl", hash = "sha256:3a226f231e962a16dd5f6fcf0c113235805ba206e294717a64fa8e04ae3ad9c4"}, {file = "Authlib-0.11.tar.gz", hash = "sha256:9741db6de2950a0a5cefbdb72ec7ab12f7e9fd530ff47219f1530e79183cbaaf"}, ] -authutils = [ - {file = "authutils-6.2.1-py3-none-any.whl", hash = "sha256:57575520e7c8215ac730a16d2eed06018b7be43c4056c5f3df6ac1b9b9119166"}, - {file = "authutils-6.2.1.tar.gz", hash = "sha256:329679dd20e1251209a44b8b7b2f7f5598d2a2ef73497310fd79968554da36e6"}, -] +authutils = [] backoff = [ {file = "backoff-1.11.1-py2.py3-none-any.whl", hash = "sha256:61928f8fa48d52e4faa81875eecf308eccfb1016b018bb6bd21e05b5d90a96c5"}, {file = "backoff-1.11.1.tar.gz", hash = "sha256:ccb962a2378418c667b3c979b504fdeb7d9e0d29c0579e3b13b86467177728cb"}, ] +beautifulsoup4 = [] +bs4 = [] cached-property = [ {file = "cached-property-1.5.2.tar.gz", hash = "sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130"}, {file = "cached_property-1.5.2-py2.py3-none-any.whl", hash = "sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"}, @@ -1084,20 +1114,14 @@ cdislogging = [ {file = "cdislogging-1.1.1.tar.gz", hash = "sha256:77e11648244cda3a8094b8ae6081435a2303f259612846c49ef8825c7be141e3"}, ] cdisutilstest = [] -certifi = [ - {file = "certifi-2022.6.15-py3-none-any.whl", hash = "sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412"}, - {file = "certifi-2022.6.15.tar.gz", hash = "sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d"}, -] +certifi = [] cffi = [] -charset-normalizer = [ - {file = "charset-normalizer-2.1.0.tar.gz", hash = "sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413"}, - {file = "charset_normalizer-2.1.0-py3-none-any.whl", hash = "sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5"}, -] -click = [] -colorama = [ - {file = "colorama-0.4.5-py2.py3-none-any.whl", hash = "sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da"}, - {file = "colorama-0.4.5.tar.gz", hash = "sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"}, +charset-normalizer = [] +click = [ + {file = "click-7.1.2-py2.py3-none-any.whl", hash = "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"}, + {file = "click-7.1.2.tar.gz", hash = "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a"}, ] +colorama = [] coverage = [] cryptography = [] dataclasses-json = [] @@ -1118,20 +1142,16 @@ h11 = [ {file = "h11-0.12.0.tar.gz", hash = "sha256:47222cb6067e4a307d535814917cd98fd0a57b6788ce715755fa2b6c28b56042"}, ] hsclient = [] -httpcore = [ - {file = "httpcore-0.15.0-py3-none-any.whl", hash = "sha256:1105b8b73c025f23ff7c36468e4432226cbb959176eab66864b8e31c4ee27fa6"}, - {file = "httpcore-0.15.0.tar.gz", hash = "sha256:18b68ab86a3ccf3e7dc0f43598eaddcf472b602aba29f9aa6ab85fe2ada3980b"}, -] -httpx = [ - {file = "httpx-0.23.0-py3-none-any.whl", hash = "sha256:42974f577483e1e932c3cdc3cd2303e883cbfba17fe228b0f63589764d7b9c4b"}, - {file = "httpx-0.23.0.tar.gz", hash = "sha256:f28eac771ec9eb4866d3fb4ab65abd42d38c424739e80c08d8d20570de60b0ef"}, -] +httpcore = [] +httpx = [] humanfriendly = [] idna = [ {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, ] -indexclient = [] +indexclient = [ + {file = "indexclient-2.1.0.tar.gz", hash = "sha256:777476eb97febfcd663b7f9454fba5151dee4d17254bfa0dfb5ff040c59b8532"}, +] indexd = [] iniconfig = [ {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, @@ -1139,53 +1159,21 @@ iniconfig = [ ] itsdangerous = [] jinja2 = [] -jsonschema = [] -markupsafe = [ - {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812"}, - {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a"}, - {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e"}, - {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5"}, - {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4"}, - {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f"}, - {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e"}, - {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933"}, - {file = "MarkupSafe-2.1.1-cp310-cp310-win32.whl", hash = "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6"}, - {file = "MarkupSafe-2.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417"}, - {file = "MarkupSafe-2.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02"}, - {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a"}, - {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37"}, - {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980"}, - {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a"}, - {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3"}, - {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a"}, - {file = "MarkupSafe-2.1.1-cp37-cp37m-win32.whl", hash = "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff"}, - {file = "MarkupSafe-2.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a"}, - {file = "MarkupSafe-2.1.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452"}, - {file = "MarkupSafe-2.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003"}, - {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1"}, - {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601"}, - {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925"}, - {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f"}, - {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88"}, - {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63"}, - {file = "MarkupSafe-2.1.1-cp38-cp38-win32.whl", hash = "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1"}, - {file = "MarkupSafe-2.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-win32.whl", hash = "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c"}, - {file = "MarkupSafe-2.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247"}, - {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"}, +jsonschema = [ + {file = "jsonschema-3.2.0-py2.py3-none-any.whl", hash = "sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163"}, + {file = "jsonschema-3.2.0.tar.gz", hash = "sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a"}, ] +markupsafe = [] marshmallow = [] -marshmallow-enum = [] +marshmallow-enum = [ + {file = "marshmallow-enum-1.5.1.tar.gz", hash = "sha256:38e697e11f45a8e64b4a1e664000897c659b60aa57bfa18d44e226a9920b6e58"}, + {file = "marshmallow_enum-1.5.1-py2.py3-none-any.whl", hash = "sha256:57161ab3dbfde4f57adeb12090f39592e992b9c86d206d02f6bd03ebec60f072"}, +] multidict = [] -mypy-extensions = [] +mypy-extensions = [ + {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"}, + {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"}, +] numpy = [] packaging = [ {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, @@ -1196,7 +1184,19 @@ pluggy = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, ] -psycopg2 = [] +psycopg2 = [ + {file = "psycopg2-2.9.3-cp310-cp310-win32.whl", hash = "sha256:083707a696e5e1c330af2508d8fab36f9700b26621ccbcb538abe22e15485362"}, + {file = "psycopg2-2.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:d3ca6421b942f60c008f81a3541e8faf6865a28d5a9b48544b0ee4f40cac7fca"}, + {file = "psycopg2-2.9.3-cp36-cp36m-win32.whl", hash = "sha256:9572e08b50aed176ef6d66f15a21d823bb6f6d23152d35e8451d7d2d18fdac56"}, + {file = "psycopg2-2.9.3-cp36-cp36m-win_amd64.whl", hash = "sha256:a81e3866f99382dfe8c15a151f1ca5fde5815fde879348fe5a9884a7c092a305"}, + {file = "psycopg2-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:cb10d44e6694d763fa1078a26f7f6137d69f555a78ec85dc2ef716c37447e4b2"}, + {file = "psycopg2-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:4295093a6ae3434d33ec6baab4ca5512a5082cc43c0505293087b8a46d108461"}, + {file = "psycopg2-2.9.3-cp38-cp38-win32.whl", hash = "sha256:34b33e0162cfcaad151f249c2649fd1030010c16f4bbc40a604c1cb77173dcf7"}, + {file = "psycopg2-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:0762c27d018edbcb2d34d51596e4346c983bd27c330218c56c4dc25ef7e819bf"}, + {file = "psycopg2-2.9.3-cp39-cp39-win32.whl", hash = "sha256:8cf3878353cc04b053822896bc4922b194792df9df2f1ad8da01fb3043602126"}, + {file = "psycopg2-2.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:06f32425949bd5fe8f625c49f17ebb9784e1e4fe928b7cce72edc36fb68e4c0c"}, + {file = "psycopg2-2.9.3.tar.gz", hash = "sha256:8e841d1bf3434da985cc5ef13e6f75c8981ced601fd70cc6bf33351b91562981"}, +] py = [ {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, @@ -1205,14 +1205,8 @@ pycparser = [ {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] -pyjwt = [ - {file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"}, - {file = "PyJWT-2.4.0.tar.gz", hash = "sha256:d42908208c699b3b973cbeb01a969ba6a96c821eefb1c5bfe4c390c01d67abba"}, -] -pyparsing = [ - {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, - {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, -] +pyjwt = [] +pyparsing = [] pypfb = [] pyreadline3 = [] pyrsistent = [ @@ -1243,7 +1237,10 @@ pytest = [ {file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"}, ] pytest-cov = [] -python-dateutil = [] +python-dateutil = [ + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, +] python-json-logger = [] pytz = [] pyyaml = [ @@ -1277,10 +1274,7 @@ pyyaml = [ {file = "PyYAML-5.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:c20cfa2d49991c8b4147af39859b167664f2ad4561704ee74c1de03318e898db"}, {file = "PyYAML-5.4.1.tar.gz", hash = "sha256:607774cbba28732bfa802b54baa7484215f530991055bb562efbed5b2f20a45e"}, ] -requests = [ - {file = "requests-2.28.1-py3-none-any.whl", hash = "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"}, - {file = "requests-2.28.1.tar.gz", hash = "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983"}, -] +requests = [] requests-mock = [] rfc3986 = [ {file = "rfc3986-1.5.0-py2.py3-none-any.whl", hash = "sha256:a86d6e1f5b1dc238b218b012df0aa79409667bb209e58da56d0b94704e712a97"}, @@ -1290,10 +1284,8 @@ six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] -sniffio = [ - {file = "sniffio-1.2.0-py3-none-any.whl", hash = "sha256:471b71698eac1c2112a40ce2752bb2f4a4814c22a54a3eed3676bc0f5ca9f663"}, - {file = "sniffio-1.2.0.tar.gz", hash = "sha256:c4666eecec1d3f50960c6bdf61ab7bc350648da6c126e3cf6898d8cd4ddcd3de"}, -] +sniffio = [] +soupsieve = [] sqlalchemy = [ {file = "SQLAlchemy-1.3.24-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:87a2725ad7d41cd7376373c15fd8bf674e9c33ca56d0b8036add2d634dba372e"}, {file = "SQLAlchemy-1.3.24-cp27-cp27m-win32.whl", hash = "sha256:f597a243b8550a3a0b15122b14e49d8a7e622ba1c9d29776af741f1845478d79"}, @@ -1330,7 +1322,10 @@ sqlalchemy = [ {file = "SQLAlchemy-1.3.24-cp39-cp39-win_amd64.whl", hash = "sha256:09083c2487ca3c0865dc588e07aeaa25416da3d95f7482c07e92f47e080aa17b"}, {file = "SQLAlchemy-1.3.24.tar.gz", hash = "sha256:ebbb777cbf9312359b897bf81ba00dae0f5cb69fba2a18265dcc18a6f5ef7519"}, ] -sqlalchemy-utils = [] +sqlalchemy-utils = [ + {file = "SQLAlchemy-Utils-0.37.9.tar.gz", hash = "sha256:4667edbdcb1ece011076b69772ef524bfbb17cc97e03f11ee6b85d98e7741d61"}, + {file = "SQLAlchemy_Utils-0.37.9-py3-none-any.whl", hash = "sha256:bb6f4da8ac044cb0dd4d0278b1fb434141a5ee9d1881c757a076830ddbb04160"}, +] toml = [ {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, @@ -1341,8 +1336,5 @@ typing-extensions = [] typing-inspect = [] urllib3 = [] werkzeug = [] -xmltodict = [ - {file = "xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852"}, - {file = "xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56"}, -] +xmltodict = [] yarl = [] diff --git a/pyproject.toml b/pyproject.toml index b06ad98cd..1a107f24e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ python-dateutil = "*" aiofiles = "^0.8.0" pandas = "^1.4.2" httpx = "*" +bs4 = "^0.0.1" [tool.poetry.dev-dependencies] pytest = "^6.0.0" From 3ead6f7d7f44d6ad26a38a3502f6cf4a94611497 Mon Sep 17 00:00:00 2001 From: Spencer Axelrod Date: Wed, 14 Sep 2022 02:10:14 -0500 Subject: [PATCH 4/4] add create_new_discovery_page functionailty --- gen3/cli/discovery.py | 32 +++++++-- gen3/tools/metadata/discovery.py | 112 +++++++++++++++++++++++++++++-- 2 files changed, 131 insertions(+), 13 deletions(-) diff --git a/gen3/cli/discovery.py b/gen3/cli/discovery.py index 52de95c59..a61736c8f 100644 --- a/gen3/cli/discovery.py +++ b/gen3/cli/discovery.py @@ -3,7 +3,8 @@ import click from gen3.tools.metadata.discovery import ( - generate_discovery_metadata, + scrape_discovery_metadata, + create_new_discovery_page_file, publish_discovery_metadata, output_expanded_discovery_metadata, try_delete_discovery_guid, @@ -97,19 +98,36 @@ def discovery_delete(ctx, guid): @click.command() @click.pass_context -def discovery_generate(ctx): +def discovery_scrape(ctx): """ - Generate discovery metadata from dbgap + Scrape discovery metadata from dbgap """ - print("get auth") auth = ctx.obj["auth_factory"].get() - print("get endpoint") endpoint = ctx.obj.get("endpoint") - print(f"generate_discovery_metadata() for {endpoint}") - generate_discovery_metadata(auth, endpoint=endpoint) + output_file = scrape_discovery_metadata(auth, endpoint=endpoint) + click.echo(output_file) + + +@click.command() +@click.argument("dbgap_metadata_file", required=True) +@click.argument("discovery_metadata_file", required=True) +@click.pass_context +def discovery_generate(ctx, dbgap_metadata_file, discovery_metadata_file): + """ + Generate a metadata TSV file of new discovery page metadata. + New metadata is determined by reading current explore page metadata and scraped dbgap metadata. + """ + output_file = create_new_discovery_page_file( + dbgap_metadata_file, + discovery_metadata_file, + output_filepath="new_discovery_page_metadata.tsv", + ) + + click.echo(output_file) discovery.add_command(discovery_read, name="read") discovery.add_command(discovery_publish, name="publish") discovery.add_command(discovery_delete, name="delete") +discovery.add_command(discovery_scrape, name="scrape") discovery.add_command(discovery_generate, name="generate") diff --git a/gen3/tools/metadata/discovery.py b/gen3/tools/metadata/discovery.py index 151c38445..9f31c49f1 100644 --- a/gen3/tools/metadata/discovery.py +++ b/gen3/tools/metadata/discovery.py @@ -28,11 +28,11 @@ logging = get_logger("__name__") -def generate_discovery_metadata(auth, endpoint=None): +def scrape_discovery_metadata(auth, endpoint=None): """ Get discovery metadata from dbgap for currently submitted studies in a commons """ - print(f"getting currently submitted project/study data from {endpoint}...") + logging.info(f"Getting currently submitted project/study data from '{endpoint}'...") submission = Gen3Submission(endpoint, auth_provider=auth) query_txt = """ { @@ -61,7 +61,6 @@ def generate_discovery_metadata(auth, endpoint=None): results = [] fields = set() - print(f"parsing {endpoint} submission query...") for raw_result in raw_results: studies = raw_result.get("studies") study_data = {} @@ -89,7 +88,7 @@ def generate_discovery_metadata(auth, endpoint=None): fields = fields | set(result.keys()) output_filepath = _dbgap_file_from_auth(auth) - print(f"Writing to {output_filepath}...") + logging.info(f"Writing to {output_filepath}") with open(output_filepath, "w+", encoding="utf-8") as output_file: logging.info(f"writing headers to {output_filepath}: {fields}") output_writer = csv.DictWriter( @@ -103,6 +102,107 @@ def generate_discovery_metadata(auth, endpoint=None): for row in results: output_writer.writerow(row) + return output_filepath + + +def create_new_discovery_page_file( + dbgap_metadata_file, + current_explore_file, + output_filepath="new_discovery_page_metadata.tsv", +): + """ + Generate new discovery page metadata from dbgap metadata and existing discovery metadata + """ + explore_page_dict = {} + explore_fieldnames = [] + + # Build dictionary from current discovery metadata + with open(current_explore_file) as curr_file: + logging.info("Reading discovery page metadata...") + curr_reader = csv.DictReader(curr_file, delimiter="\t") + explore_fieldnames = curr_reader.fieldnames + + for i, line in enumerate(curr_reader): + key = line["guid"] + explore_page_dict[key] = line + + # Build a dictionary of new studies and metadata from dbgap metadata + # Write to an output file for submission + new_metadata_dict = {} + with open(dbgap_metadata_file) as dbgap_file, open( + output_filepath, "w+", encoding="utf-8" + ) as output_file: + dbgap_reader = csv.DictReader(dbgap_file, delimiter="\t") + logging.info("Building new discovery page metadata manifest...") + + for i, line in enumerate(dbgap_reader): + if (line["study_id"] not in explore_page_dict.keys()) and ( + line["project_id"] not in explore_page_dict.keys() + ): + logging.info(line["study_id"] + ": " + line["project_id"]) + guid = ( + line["study_id"] + if (line["study_id"][:3] == "phs") + else line["project_id"] + ) + curr_dict_row = {} + tag_string = line["tags"] + tag_list = ast.literal_eval(tag_string) + for field in explore_fieldnames: + if "tag" in field: + if tag_list: + curr_tag_dict = tag_list.pop(0) + curr_tag = ( + curr_tag_dict["category"] + ": " + curr_tag_dict["name"] + ) + curr_dict_row[field] = curr_tag + else: + curr_dict_row[field] = "" + + elif field == "dbgap_url": + study_link_ending = guid[:-3] if (guid[:3] == "phs") else ".." + dbgap_url = ( + "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=" + + study_link_ending + ) + curr_dict_row[field] = dbgap_url + + elif field == "guid": + curr_dict_row[field] = guid + + elif field == "authz": + curr_dict_row[field] = line[field].strip(']["') + + elif field == "study_description": + curr_dict_row[field] = line[field].replace("\n", "") + + elif field == "__manifest": + curr_dict_row[field] = "" + + elif field == "null": + curr_dict_row[field] = "" + + else: + curr_dict_row[field] = line[field] + + new_metadata_dict[guid] = curr_dict_row + + else: + pass + + output_writer = csv.DictWriter( + output_file, + delimiter="\t", + fieldnames=explore_fieldnames, + extrasaction="ignore", + ) + output_writer.writeheader() + for i, guid in enumerate(new_metadata_dict.keys()): + line = new_metadata_dict[guid] + output_writer.writerow(line) + + return output_filepath + async def output_expanded_discovery_metadata( auth, endpoint=None, limit=500, use_agg_mds=False @@ -428,7 +528,7 @@ def _get_study_description(study): dbgap_version = study.get("dbgap_version", "") or "" dbgap_participant_set = study.get("dbgap_participant_set", "") or "" dbgap_study = f"{dbgap_phs}.{dbgap_version}.{dbgap_participant_set}" - print(f"Getting study description for {dbgap_study}...") + logging.info(f"Getting study description for {dbgap_study}...") study_description = study.get("study_description") if dbgap_study != "..": @@ -459,7 +559,7 @@ def _get_study_description(study): links.decompose() study_description = ( - study_description.getText().strip().replace("\t", " ") + study_description.getText().strip().replace("\t", "") + f"\n\nNOTE: This text was scraped from https://www.ncbi.nlm.nih.gov/ on {date.today()} and may not include exact formatting or images." ) logging.debug(f"{study_description}")