From 1fcf98d7a6c67f906e327724576b711003a38a6d Mon Sep 17 00:00:00 2001 From: ankit-a-mehta Date: Mon, 13 Aug 2018 15:37:17 +0000 Subject: [PATCH 1/7] Done --- __pycache__/__init__.cpython-36.pyc | Bin 165 -> 151 bytes .../__pycache__/__init__.cpython-36.pyc | Bin 179 -> 165 bytes .../__pycache__/build.cpython-36.pyc | Bin 568 -> 577 bytes q01_load_data/build.py | 11 +++++++++-- .../tests/__pycache__/__init__.cpython-36.pyc | Bin 185 -> 171 bytes .../tests/__pycache__/tests.cpython-36.pyc | Bin 1577 -> 1563 bytes 6 files changed, 9 insertions(+), 2 deletions(-) diff --git a/__pycache__/__init__.cpython-36.pyc b/__pycache__/__init__.cpython-36.pyc index b4b72099455159189764a0ad76aa1fabbc62e96a..f69d09099389d38fb9bd90ce189716a644ad2890 100644 GIT binary patch delta 55 zcmZ3=IGvHhn3tDp?$W~Oi5%w2F8UeyxvBbzdD)pIx{12EsTn1S`r-LS*~JBk$*KCu K`6;Or3rqpy0}`nK delta 69 zcmbQvxRjB@n3tD}Z`Piui5%vJLHeP^sYS*5iAfom#TlvD#idC_iMfe+nTdJ&E~&-Y YCHV#V`30$Y@dZWsS*gh-#S@cE0UPfa$^ZZW diff --git a/q01_load_data/__pycache__/__init__.cpython-36.pyc b/q01_load_data/__pycache__/__init__.cpython-36.pyc index 92b3ac2d3dd2682028775eba5dae0f9658f2aebd..fb4f13e0188b94701c1735f3a0193e4af6a1ab75 100644 GIT binary patch delta 55 zcmdnYxRjB@n3tDp?$W~Oi5%w2LHZf_xvBbzdD)pIx{12EsTn1S`r-LS*~JBk$*KCu K`6;Or3!DM#4HDr1 delta 69 zcmZ3=xS5f|n3tDJb=ID!i5%vJ3HqVMsYS*5iAfom#TlvD#idC_iMfe+nTdJ&E~&-Y YCHV#V`30$Y@dZWsS*gh-#S@d90V}B)82|tP diff --git a/q01_load_data/__pycache__/build.cpython-36.pyc b/q01_load_data/__pycache__/build.cpython-36.pyc index e27baf631ae2be11ac2de9e6f21754e2497753d8..40b18a9c81de8e3a1346a64fb2c3e8a15dab9878 100644 GIT binary patch delta 267 zcmdnNa*(Con3tDp-qONoIYtJC#|%h-3CMN;;$j6Lk;0I|n8Ogon8Fmrl){|C(!vnM zoXVWa)Xc~TlnG|gWPJ%##o$*Zt*4)oSdyrpT9KTZqnn(cTc8W#>s90wS0n?i00WTG z%s}kSz%Vh)gVAr|4ku-Q{fzwFRQ<%f?939~MBUuf44^6D`9;~q1&PV2`pNkzsgt!C zYXzgYic%9(;=$(MV$Mx0n7o@&ONSL`S}_O6a7HdhF-952A^{*(tjky{e2c3nH8CYV zwIVq+=N4;XK|xOCMS5fae%C5;$jqIlwm9q1hO@mZ?P35=A|SS7x4hOMS_zB PnH0Ghfy`o%PdJzXg$zE! diff --git a/q01_load_data/build.py b/q01_load_data/build.py index 69d7209..7126d92 100644 --- a/q01_load_data/build.py +++ b/q01_load_data/build.py @@ -1,5 +1,12 @@ +# %load q01_load_data/build.py import pandas as pd - +path = './data/excel-comp-data.xlsx' def q01_load_data(path): - "write your solution here" + df = pd.read_excel(path) + df['state'] = df['state'].map(lambda x: x.lower()) + df['total'] = df['Jan'] + df['Feb'] + df['Mar'] + return df + +q01_load_data(path) + diff --git a/q01_load_data/tests/__pycache__/__init__.cpython-36.pyc b/q01_load_data/tests/__pycache__/__init__.cpython-36.pyc index 2a2dfc72b54611f7a4f5a7734a9c697f606c0089..71fa8415c037aeb2b870a995f0a0c874bffafb92 100644 GIT binary patch delta 55 zcmdnVxSEl}n3tDp!P3I$i5%w25&9YVxvBbzdD)pIx{12EsTn1S`r-LS*~JBk$*KCu K`6;Or3p@eu2@?DO delta 69 zcmZ3@xRa5?n3tD}ZT6n1i5%vJY5JkXsYS*5iAfom#TlvD#idC_iMfe+nTdJ&E~&-Y YCHV#V`30$Y@dZWsS*gh-#S@b}0Vxg{6951J diff --git a/q01_load_data/tests/__pycache__/tests.cpython-36.pyc b/q01_load_data/tests/__pycache__/tests.cpython-36.pyc index 76e04c8a5e2c476c2fba795d8bae25fd09cc88ed..68658da9a6fbb1271de89b77e8aa3a1daa7a0627 100644 GIT binary patch delta 58 zcmZ3nA2AR#B cXP4v`=;s%t=EWBjn+a From 97bf115b4085b46b08cd931a32dace116cef887c Mon Sep 17 00:00:00 2001 From: ankit-a-mehta Date: Mon, 13 Aug 2018 16:25:20 +0000 Subject: [PATCH 2/7] Done --- .../__pycache__/__init__.cpython-36.pyc | Bin 180 -> 166 bytes .../__pycache__/build.cpython-36.pyc | Bin 626 -> 655 bytes q02_append_row/build.py | 23 ++++++++++++++++-- .../tests/__pycache__/__init__.cpython-36.pyc | Bin 186 -> 172 bytes .../tests/__pycache__/tests.cpython-36.pyc | Bin 1355 -> 1341 bytes 5 files changed, 21 insertions(+), 2 deletions(-) diff --git a/q02_append_row/__pycache__/__init__.cpython-36.pyc b/q02_append_row/__pycache__/__init__.cpython-36.pyc index de0cf6163be25541da218f6f8a0c4d005729c09c..5ee58feb313dec28a6c440f4733ccfd8e0ee4282 100644 GIT binary patch delta 55 zcmdnOxQvm*n3tDp?$W~Oi5%w2!TK5bxvBbzdD)pIx{12EsTn1S`r-LS*~JBk$*KCu K`6;Or3tRx}pc3T( delta 69 zcmZ3+xP_6!n3tE!c-Ee%i5%vJiTa_%sYS*5iAfom#TlvD#idC_iMfe+nTdJ&E~&-Y YCHV#V`30$Y@dZWsS*gh-#S@cU057f?D*ylh diff --git a/q02_append_row/__pycache__/build.cpython-36.pyc b/q02_append_row/__pycache__/build.cpython-36.pyc index 508826776138177582df97d666bb284c1e1484b0..1aeab8fe7470a33cf58fc39823b0d5e4884911e3 100644 GIT binary patch literal 655 zcmZuu&5qMB5cW@!X1iO2_J+7{&7sKx;#MIfB#@9;A+ZutI7Dv7+P3S|sqKmul~Z{V zZrpeXp2Am7yuzLs8zE&SY|YH)FZRqglc$r(_~(z!$8VgFUu3Wt;umOkh7Jf&MQR!& zt67Y^<`|huy0lIy0bXa2K>C%`BRP`Uks?RmlOp?#8OUWp{bY0TxKUa_1Hy^V56ddV zO72>zrmgP!X*icVWp^zNk8lMY3EE3Edxg%C13A(!^gw|=AS-$#2fAXA%`gHVFr&yM z11675I$%x$gBr(|p1%?6+tb%Ztl`m>c0zgnO3pohD~wNFAPQ!J5`&{6|8P3M*#M6M z<|e53+-^J1ge#a&df^sz9@=cIbl-_N#T&Qps)g=kCDu1fHx<)PE^sU#w7IdpXl2!E zkX4Vugl(?3OOVj?Mz3V+s?EidM)bX0Lu2%(YQ9}6DEqy?clSe%RqPo$Li6OD#(iQY zcrJ3E{Y~vz52$r-J=3;+c5UQdINhn`y!>zN^3LBZW9iti8Drj8S%5Z)i#=R~;>Bns%Rb#p#IbP+-#N`V9opPs`#CJ-_FLK8i&R?Os1p*MDH4$`- zOt3L>!7;Kr+QcHJ1UN)riAaFHCt3U(Z;^Ky4b#iT)1o#K3Xoo&qs(YA;^42*-7$hE zJ-Mbk+Ebvw2C`N3gjDpJ?3ia2?-|55w2F4Tr&Zi@NMLfqc2OUpT>9nlmDOI&wx+W) zXX?&t)6CY&s=(jKCh${L2L4vs5PRcgjfF)+J!cse2YS^Qs|wwK`kW%d?sCYV}SbA_!h3!IA-<5D-W{zmKrB>VFz|KOB$ zxh&V(t<{C=N+gtxmQ8-5+=Vx7ZjiTVt*O+xcX=xtkgiyD8WjB7&Mz0wirWGPy}ayn z4fA#z4(}gKhBw_uAS|Vi=phdK5LekGB)7zN50}{E)&*vqc(Jl-E4|s&x}5)4dVcR0 i=cDrYP$Ti!Ufv#3Bhc9;tLQ0R=0sjjW0IuZ# diff --git a/q02_append_row/build.py b/q02_append_row/build.py index af3701d..16baba0 100644 --- a/q02_append_row/build.py +++ b/q02_append_row/build.py @@ -1,11 +1,30 @@ +# %load q02_append_row/build.py import pandas as pd import sys, os #sys.path.append(os.path.join(os.path.dirname(os.curdir))) from greyatomlib.pandas_guided_project.q01_load_data.build import q01_load_data - +path = './data/excel-comp-data.xlsx' def q02_append_row(path): - "write your solution here" + data_set = q01_load_data(path) + ''' + Approach 1 + ''' + data_set.at['Grand Total', 'Jan'] = data_set['Jan'].sum() + data_set.at['Grand Total', 'Feb'] = data_set['Feb'].sum() + data_set.at['Grand Total', 'Mar'] = data_set['Mar'].sum() + data_set.at['Grand Total', 'total'] = data_set['total'].sum() + + ''' + Approach 2 + ''' + #data_set.loc['Grand Total', 'Jan'] = data_set['Jan'].sum() + #data_set.loc['Grand Total', 'Feb'] = data_set['Feb'].sum() + #data_set.loc['Grand Total', 'Mar'] = data_set['Mar'].sum() + #data_set.loc['Grand Total', 'total'] = data_set['total'].sum() + return data_set +q02_append_row(path) + diff --git a/q02_append_row/tests/__pycache__/__init__.cpython-36.pyc b/q02_append_row/tests/__pycache__/__init__.cpython-36.pyc index dab3ecac755f4882f0d0d63dd70ab57f6fb428f8..ef74c25e61f27d55b388aade2d8815f38a5eb45e 100644 GIT binary patch delta 55 zcmdnRxQ3C#n3tDp?$W~Oi5%w2k@^|=xvBbzdD)pIx{12EsTn1S`r-LS*~JBk$*KCu K`6;Or3%mgD5)%CY delta 69 zcmZ3(xQmg)n3tEUXwIIfi5%vJ>H49?sYS*5iAfom#TlvD#idC_iMfe+nTdJ&E~&-Y YCHV#V`30$Y@dZWsS*gh-#S@df07F(8ga7~l diff --git a/q02_append_row/tests/__pycache__/tests.cpython-36.pyc b/q02_append_row/tests/__pycache__/tests.cpython-36.pyc index 742ee79571b722fe1fcab2f42f1dbcae23a14bd1..401ffec091ccf0a46dc6867526d85f92ebc0ac06 100644 GIT binary patch delta 58 zcmX@jwU>*-n3tDp?$W~OjT~ngmBaKi@^e%56Z5h&OLP-;b5k=)67|FLi?WLg5|dN) Olk-zjH?uLhG6MiYeiNMl delta 72 zcmdnXb()LAn3tDJY{8zWjT~ng4O8?(i&Kk=^%IjaGK(`(vx`fUiV|}Z^D-0j^j%Vm cvrF;|^z#c+^WqDN^0QKtONuxDV02{$0G`+y%K!iX From d470850d817d75b11cbd9887c2f761ece3b1afe4 Mon Sep 17 00:00:00 2001 From: ankit-a-mehta Date: Sat, 18 Aug 2018 10:05:06 +0000 Subject: [PATCH 3/7] Done --- ...ed_Project\\Scraped_Data\\scrapeddata.csv" | 78 ++++++++++++++++++ .../__pycache__/__init__.cpython-36.pyc | Bin 182 -> 168 bytes .../__pycache__/build.cpython-36.pyc | Bin 888 -> 1612 bytes q03_scrape_clean/build.py | 65 ++++++++++++++- .../tests/__pycache__/__init__.cpython-36.pyc | Bin 188 -> 174 bytes .../tests/__pycache__/tests.cpython-36.pyc | Bin 1396 -> 1382 bytes 6 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 "D:\\GreyAtom\\Guided_Project\\Scraped_Data\\scrapeddata.csv" diff --git "a/D:\\GreyAtom\\Guided_Project\\Scraped_Data\\scrapeddata.csv" "b/D:\\GreyAtom\\Guided_Project\\Scraped_Data\\scrapeddata.csv" new file mode 100644 index 0000000..3c9e411 --- /dev/null +++ "b/D:\\GreyAtom\\Guided_Project\\Scraped_Data\\scrapeddata.csv" @@ -0,0 +1,78 @@ +United States of America,Federal state,US USA 840,US,00,,,U.S.,U.S.,U.S.A. +Alabama,State,US-AL,AL,01,AL,AL,Ala.,Ala., +Alaska,State,US-AK,AK,02,AK,AK,Alaska,Alaska,Alas. +Arizona,State,US-AZ,AZ,04,AZ,AZ,Ariz.,Ariz.,Az. +Arkansas,State,US-AR,AR,05,AR,AR,Ark.,Ark., +California,State,US-CA,CA,06,CA,CF,Calif.,Calif.,"Ca., Cal." +Colorado,State,US-CO,CO,08,CO,CL,Colo.,Colo.,Col. +Connecticut,State,US-CT,CT,09,CT,CT,Conn.,Conn.,Ct. +Delaware,State,US-DE,DE,10,DE,DL,Del.,Del.,De. +District of Columbia,Federal district,US-DC,DC,11,DC,DC,D.C.,D.C.,Wash. D.C. +Florida,State,US-FL,FL,12,FL,FL,Fla.,Fla.,"Fl., Flor." +Georgia,State,US-GA,GA,13,GA,GA,Ga.,Ga.,Geo. +Hawaii,State,US-HI,HI,15,HI,HA,Hawaii,Hawaii,H.I. +Idaho,State,US-ID,ID,16,ID,ID,Idaho,Idaho,"Id., Ida." +Illinois,State,US-IL,IL,17,IL,IL,Ill.,Ill.,"Il., Ills., Ill's" +Indiana,State,US-IN,IN,18,IN,IN,Ind.,Ind.,In. +Iowa,State,US-IA,IA,19,IA,IA,Iowa,Iowa,"Ia., Ioa.[1]" +Kansas,State,US-KS,KS,20,KS,KA,Kans.,Kan.,"Ks., Ka." +Kentucky,State (Commonwealth),US-KY,KY,21,KY,KY,Ky.,Ky.,"Ken., Kent." +Louisiana,State,US-LA,LA,22,LA,LA,La.,La., +Maine,State,US-ME,ME,23,ME,ME,Maine,Maine,Me. +Maryland,State,US-MD,MD,24,MD,MD,Md.,Md., +Massachusetts,State (Commonwealth),US-MA,MA,25,MA,MS,Mass.,Mass., +Michigan,State,US-MI,MI,26,MI,MC,Mich.,Mich., +Minnesota,State,US-MN,MN,27,MN,MN,Minn.,Minn.,Mn. +Mississippi,State,US-MS,MS,28,MS,MI,Miss.,Miss., +Missouri,State,US-MO,MO,29,MO,MO,Mo.,Mo., +Montana,State,US-MT,MT,30,MT,MT,Mont.,Mont., +Nebraska,State,US-NE,NE,31,NE,NB,Nebr.,Neb., +Nevada,State,US-NV,NV,32,NV,NV,Nev.,Nev.,Nv. +New Hampshire,State,US-NH,NH,33,NH,NH,N.H.,N.H., +New Jersey,State,US-NJ,NJ,34,NJ,NJ,N.J.,N.J.,N.Jersey +New Mexico,State,US-NM,NM,35,NM,NM,N. Mex.,N.M.,New M. +New York,State,US-NY,NY,36,NY,NY,N.Y.,N.Y.,N. York +North Carolina,State,US-NC,NC,37,NC,NC,N.C.,N.C.,N. Car. +North Dakota,State,US-ND,ND,38,ND,ND,N. Dak.,N.D.,NoDak +Ohio,State,US-OH,OH,39,OH,OH,Ohio,Ohio,"O., Oh." +Oklahoma,State,US-OK,OK,40,OK,OK,Okla.,Okla.,Ok. +Oregon,State,US-OR,OR,41,OR,OR,Oreg.,Ore.,Or. +Pennsylvania,State (Commonwealth),US-PA,PA,42,PA,PA,Pa.,Pa.,"Penn., Penna." +Rhode Island,State,US-RI,RI,44,RI,RI,R.I.,R.I.,"R.I. & P.P., R. Isl." +South Carolina,State,US-SC,SC,45,SC,SC,S.C.,S.C.,S. Car. +South Dakota,State,US-SD,SD,46,SD,SD,S. Dak.,S.D.,SoDak +Tennessee,State,US-TN,TN,47,TN,TN,Tenn.,Tenn., +Texas,State,US-TX,TX,48,TX,TX,Tex.,Texas,Tx. +Utah,State,US-UT,UT,49,UT,UT,Utah,Utah,Ut. +Vermont,State,US-VT,VT,50,VT,VT,Vt.,Vt., +Virginia,State (Commonwealth),US-VA,VA,51,VA,VA,Va.,Va.,Virg. +Washington,State,US-WA,WA,53,WA,WN,Wash.,Wash.,"Wa., Wn.[2]" +West Virginia,State,US-WV,WV,54,WV,WV,W. Va.,W.Va.,"W.V., W. Virg." +Wisconsin,State,US-WI,WI,55,WI,WS,Wis.,Wis.,"Wi., Wisc." +Wyoming,State,US-WY,WY,56,WY,WY,Wyo.,Wyo.,Wy. +American Samoa,Insular area (Territory),ASASM016US-AS,AS,60,AS,AS,A.S.,, +Guam,Insular area (Territory),GUGUM316US-GU,GU,66,GU,GU,Guam,, +Northern Mariana Islands,Insular area (Commonwealth),MPMNP580US-MP,MP,69,MP,CM,M.P.,,CNMI[3] +Puerto Rico,Insular area (Territory),PRPRI630US-PR,PR,72,PR,PR,P.R.,, +U.S. Virgin Islands,Insular area (Territory),VIVIR850US-VI,VI,78,VI,VI,V.I.,,U.S.V.I. +U.S. Minor Outlying Islands,Insular areas,UMUMI581US-UM,UM,74,,,,, +Baker Island,island,UM-81,,81,,,,,XB[4] +Howland Island,island,UM-84,,84,,,,,XH[4] +Jarvis Island,island,UM-86,,86,,,,,XQ[4] +Johnston Atoll,atoll,UM-67,,67,,,,,XU[4] +Kingman Reef,atoll,UM-89,,89,,,,,XM[4] +Midway Islands,atoll,UM-71,,71,,,,,QM[4] +Navassa Island,island,UM-76,,76,,,,,XV[4] +Palmyra Atoll[5],atoll[5],UM-95,,95,,,,,XL[4] +Wake Island,atoll,UM-79,,79,,,,,QW[4] +Micronesia,Freely associated state,FMFSM583,FM,64,FM,,,, +Marshall Islands,Freely associated state,MHMHL584,MH,68,MH,,,, +Palau,Freely associated state,PWPLW585,PW,70,PW,,,, +U.S. Armed Forces – Americas[6],US military mail code,,,,AA,,,, +U.S. Armed Forces – Europe[7],US military mail code,,,,AE,,,, +U.S. Armed Forces – Pacific[8],US military mail code,,,,AP,,,, +Northern Mariana Islands,Obsolete postal code[9],,,,CM,,,, +Panama Canal Zone,Obsolete postal code,PZPCZ594,,,CZ,,,, +Nebraska,Obsolete postal code[10],,,,NB,,,, +Philippine Islands,Obsolete postal code,PHPHL608[11],,,PI,,,, +Trust Territory of the Pacific Islands,Obsolete postal code,PCPCI582,,,TT,,,, diff --git a/q03_scrape_clean/__pycache__/__init__.cpython-36.pyc b/q03_scrape_clean/__pycache__/__init__.cpython-36.pyc index e99e1734469b0d74feccbb92229a5c70c95d8904..c8a71d2010a7284fbcc57dc51984e8741b21bc12 100644 GIT binary patch delta 55 zcmdnSxPppfL zc^trBfJgy~B5EZluxg6~Tw6P^YwH9~ZQa1FtrvLAjvH~4IiOp$_Cas0Ec0514JYli z6LbLEL6?*`6AFAgl-dBRAcqG^$gd66$4&6LXJ(P+f7;XHa3 z<%~vT$i>CTtVZ8NQiXgPo(#vsP%5HWNY2j%yNZa4cqS*7;e}@x!~^*fL|%bcq1Y`+W?>b%C4bAc)s#G`K2So~f(u}i&`MH>|Tg5N_=(GR~? z&9WJLv%MZtPmBGs1^auIU%GL#^ntZS{nvF-E@!6*Shwm-C{@*l%=RYp z`C?GEA@gq4DLXNaTPsv`%Pruj3wHU(DJmPZeFs&I(gQE#Tj=unDN;V|&@S!K{%gDH zsjawO_JF&;cv5=6cU0XfZ^a#YYi0dri4OpG4X(BU^$fBQpnkbcw^tZ&KZf|4`yV*5 z7jLf+czf}Vv1&KIO*@diLwhC2F{a%Q3DtPZ;}uxLt(n7u@X0}HJE(|n~q`&1zvh%wwIqhJvw_P*!5$@)3awqL>Udg z6+C7WbvB*|0uvKHC5oKM^@0K(PUMxg<{ANABV+R<%GmO&VLCs|iI7Yj@8VpmO zaha0|qd!i0rVghhjgsr5x9(5gqUvOPGJgEtH_D?ldXnOuLhWOQvxSM+-)1 z2$ToA+Ac%_KPwa~oC=pSh^Hg<6H F;lFuH$OHfY literal 888 zcmZWn&2H2%5Vn(SHccto6KW-dB5svX1L}2Ef#AfUm#UY&EZ)qtn>2}C+W~g9J#FE} zgYXVKiLXdVeE=>!G0yT+l|=J-G9G{P%{YfUJL%}hx5-b=*mrj4iO7FRX5JD2175QU z@7G`w^lLZ?`!$+G7*vBQ#*lE>V?wyK=Kkdj0tm0!Bn5uL_M;o(W5)&6>z|NtK=zu< zd?v80;w!f1SKJ0wxZ+i`=D_#ZRj{Hep6e5t$FtyfH12o!zbHgsIOPuh&xLr8RYMp; za!oa=#FENY0P&a+M+)@C$s?WVa`%HSEoKYZ>CDKwv!!gaIcn5i`^`4N@jiFMg7%u$ z-1z8n_q3%$keygz(PZ*0d)1&W3-PyMpXQrBAu|sMR;;4&La_E_!{b|?Ip%pas#+9C zF8}zI=`BjVeV3ma)Fv0Rd1>Z&VY-3LTI z!k=({`TTHd3M~{)iyG3iEz{+a($=J2cj~@EolR6u2#o0;Y4@)P8QQ#n!s0BsLdoY(;}^Ae5>XE*8cv z6uMZ>d><~ha_U(_7k0Y#)JDnHApL`9FZ$VRASJ 0): + for column in head_tags: + column_names.append(column.find(text=True)) + + list_of_cells = [] + data_tags = row.findAll('td') + if(len(data_tags) > 0): + for row_data in data_tags: + row_text = row_data.text.strip() + '''if(len(row_data.find_all('a')) > 0): + #print ('Length of a is: ', len(row_data.find_all('a'))) + #print ('Text of a is: ', row_data.find('a').text) + row_text = row_data.find('a').text + elif(len(row_data.find_all('style')) > 0): + row_text = row_data.find('span').text + else: + row_text = row_data.find(text=True) + ''' + list_of_cells.append(row_text) + i = i + 1 + #print ('Row is: ', list_of_cells) + list_of_rows.append(list_of_cells) + + outfile = open('D:\GreyAtom\Guided_Project\Scraped_Data\scrapeddata.csv', 'w', newline='') + writer = csv.writer(outfile) + for row_for_csv in list_of_rows: + if(row_for_csv.__contains__('.mw-parser-output .monospaced{font-family:monospace,monospace}USUSA840')): + index_data = row_for_csv.index('.mw-parser-output .monospaced{font-family:monospace,monospace}USUSA840') + row_for_csv.remove('.mw-parser-output .monospaced{font-family:monospace,monospace}USUSA840') + row_for_csv.insert(index_data, 'US USA 840') + writer.writerow(row_for_csv) + outfile.flush() + outfile.close() + + df1 = pd.read_csv('D:\GreyAtom\Guided_Project\Scraped_Data\scrapeddata.csv') + # % new empty columns are added to match the dataframe results + df1[''] = '' + df1[' '] = '' + df1[' '] = '' + df1[' '] = '' + df1[' '] = '' + #print (list(df1)) + #print (len(list_of_rows)) + return df1 +q03_scrape_clean(url) + + diff --git a/q03_scrape_clean/tests/__pycache__/__init__.cpython-36.pyc b/q03_scrape_clean/tests/__pycache__/__init__.cpython-36.pyc index bee36fb3d7bd79ff797a0f6b391b7a8a93cf3fac..98fcf68f9ecf0f4dfad2b8bca17bd996074beaed 100644 GIT binary patch delta 55 zcmdnPxQ>y-n3tDp?$W~Oi5%w2(fS$rxvBbzdD)pIx{12EsTn1S`r-LS*~JBk$*KCu K`6;Or3w!|ZI1>c` delta 69 zcmZ3-xQCI$n3tF9*up(g6FJNcGxbA@Q;UlA6O%GBi!)NQi%XM=5_1#tG86OkT~dp) ZOY#f!^9xe*;tPuMvr>~wiYF%d003F18RY-~ diff --git a/q03_scrape_clean/tests/__pycache__/tests.cpython-36.pyc b/q03_scrape_clean/tests/__pycache__/tests.cpython-36.pyc index 8529c871c1d09e03d187c6dc0d0810c3df16d3e3..27fc803067701e96b167b2444f10bb91d4c9e8d2 100644 GIT binary patch delta 58 zcmeyu^^A+dn3tDp?$W~OjT~PXl_T^s@^e%56Z5h&OLP-;b5k=)67|FLi?WLg5|dN) Olk-zjH|sLFGXnrzJrqFz delta 72 zcmaFH^@WSWn3tC;bkUxujT~PX4b${Pi&Kk=^%IjaGK(`(vx`fUiV|}Z^D-0j^j%Vm cvrF;|^z#c+^WqDN^0QKtONuwkGr2PZ0LxJtu>b%7 From db01689e0a056f188fa1a2a9c179ce59dd717e97 Mon Sep 17 00:00:00 2001 From: ankit-a-mehta Date: Sat, 18 Aug 2018 17:26:30 +0000 Subject: [PATCH 4/7] Done --- .../__pycache__/build.cpython-36.pyc | Bin 1612 -> 1722 bytes q03_scrape_clean/build.py | 6 ++++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/q03_scrape_clean/__pycache__/build.cpython-36.pyc b/q03_scrape_clean/__pycache__/build.cpython-36.pyc index 43d165361544359ce41ffd769024771094b4b26c..3d3122e912dfe7d3a8502f062117284f540234a7 100644 GIT binary patch delta 278 zcmX@Zvx}G0n3tDpcUVRAl8K!E7#D5kVDx2-su4~RmSo5h%Mwo!N#Sp10`YAaQuu*1 zKa>`M(xOmW3`&dJFqBA?NY)59Gd43a0@*dfSyCWYjc_o7ro`lCrnh=k5}|pSC8;S2 z!6k_$sl^KUX$p?HsYRK|iN6>Xs+bfMs+bjk1PhQ*(3GCs!n{Q37ISe)(Jl6()PkJE zLko4#>~M8gFK8}j73_L ax3GpusW31w6lsA74OBvNvI?6zqZI(~iATTy delta 146 zcmdnRdxnS8n3tE!_I-JD&_vFEjAffS7=0P}Quu3xQ}}Hdz_jq>LMAD;8sT6DP4UU= zncj;0VpOPNQc$R3Rsa$#Kte%NZ1O4QC6gzxoMlv=9LJi+=r#E)s}nC12O|_R7pYEG ZVhfX!XJB9`QUwvpsD#SoNo?wj768JPAtwL; diff --git a/q03_scrape_clean/build.py b/q03_scrape_clean/build.py index 44c426d..34b85dc 100644 --- a/q03_scrape_clean/build.py +++ b/q03_scrape_clean/build.py @@ -57,14 +57,16 @@ def q03_scrape_clean(url): outfile.close() df1 = pd.read_csv('D:\GreyAtom\Guided_Project\Scraped_Data\scrapeddata.csv') + df1['United States of America'] = df1['United States of America'].str.replace(' ', '') + # % new empty columns are added to match the dataframe results df1[''] = '' df1[' '] = '' df1[' '] = '' df1[' '] = '' df1[' '] = '' - #print (list(df1)) - #print (len(list_of_rows)) + print (type(df1)) + print (df1.shape) return df1 q03_scrape_clean(url) From cf4307a014ae166b4c80368c9414320d837e8c6b Mon Sep 17 00:00:00 2001 From: ankit-a-mehta Date: Mon, 20 Aug 2018 15:27:22 +0000 Subject: [PATCH 5/7] Done --- "..\\data\\scrapeddata.csv" | 78 ++++++++++++++ .../__pycache__/build.cpython-36.pyc | Bin 1722 -> 924 bytes q03_scrape_clean/build.py | 64 ++---------- .../__pycache__/__init__.cpython-36.pyc | Bin 177 -> 163 bytes q04_mapping/__pycache__/build.cpython-36.pyc | Bin 877 -> 2289 bytes q04_mapping/build.py | 95 +++++++++++++++++- .../tests/__pycache__/__init__.cpython-36.pyc | Bin 183 -> 169 bytes .../tests/__pycache__/test.cpython-36.pyc | Bin 1685 -> 1671 bytes 8 files changed, 178 insertions(+), 59 deletions(-) create mode 100644 "..\\data\\scrapeddata.csv" diff --git "a/..\\data\\scrapeddata.csv" "b/..\\data\\scrapeddata.csv" new file mode 100644 index 0000000..c2c2fff --- /dev/null +++ "b/..\\data\\scrapeddata.csv" @@ -0,0 +1,78 @@ +United States of America,Federal state,US USA 840,US,00,Unnamed: 5,Unnamed: 6,U.S.,U.S..1,U.S.A.,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14 +Alabama,State,US-AL,AL,1.0,AL,AL,Ala.,Ala.,,,,,, +Alaska,State,US-AK,AK,2.0,AK,AK,Alaska,Alaska,Alas.,,,,, +Arizona,State,US-AZ,AZ,4.0,AZ,AZ,Ariz.,Ariz.,Az.,,,,, +Arkansas,State,US-AR,AR,5.0,AR,AR,Ark.,Ark.,,,,,, +California,State,US-CA,CA,6.0,CA,CF,Calif.,Calif.,"Ca., Cal.",,,,, +Colorado,State,US-CO,CO,8.0,CO,CL,Colo.,Colo.,Col.,,,,, +Connecticut,State,US-CT,CT,9.0,CT,CT,Conn.,Conn.,Ct.,,,,, +Delaware,State,US-DE,DE,10.0,DE,DL,Del.,Del.,De.,,,,, +DistrictofColumbia,Federal district,US-DC,DC,11.0,DC,DC,D.C.,D.C.,Wash. D.C.,,,,, +Florida,State,US-FL,FL,12.0,FL,FL,Fla.,Fla.,"Fl., Flor.",,,,, +Georgia,State,US-GA,GA,13.0,GA,GA,Ga.,Ga.,Geo.,,,,, +Hawaii,State,US-HI,HI,15.0,HI,HA,Hawaii,Hawaii,H.I.,,,,, +Idaho,State,US-ID,ID,16.0,ID,ID,Idaho,Idaho,"Id., Ida.",,,,, +Illinois,State,US-IL,IL,17.0,IL,IL,Ill.,Ill.,"Il., Ills., Ill's",,,,, +Indiana,State,US-IN,IN,18.0,IN,IN,Ind.,Ind.,In.,,,,, +Iowa,State,US-IA,IA,19.0,IA,IA,Iowa,Iowa,"Ia., Ioa.[1]",,,,, +Kansas,State,US-KS,KS,20.0,KS,KA,Kans.,Kan.,"Ks., Ka.",,,,, +Kentucky,State (Commonwealth),US-KY,KY,21.0,KY,KY,Ky.,Ky.,"Ken., Kent.",,,,, +Louisiana,State,US-LA,LA,22.0,LA,LA,La.,La.,,,,,, +Maine,State,US-ME,ME,23.0,ME,ME,Maine,Maine,Me.,,,,, +Maryland,State,US-MD,MD,24.0,MD,MD,Md.,Md.,,,,,, +Massachusetts,State (Commonwealth),US-MA,MA,25.0,MA,MS,Mass.,Mass.,,,,,, +Michigan,State,US-MI,MI,26.0,MI,MC,Mich.,Mich.,,,,,, +Minnesota,State,US-MN,MN,27.0,MN,MN,Minn.,Minn.,Mn.,,,,, +Mississippi,State,US-MS,MS,28.0,MS,MI,Miss.,Miss.,,,,,, +Missouri,State,US-MO,MO,29.0,MO,MO,Mo.,Mo.,,,,,, +Montana,State,US-MT,MT,30.0,MT,MT,Mont.,Mont.,,,,,, +Nebraska,State,US-NE,NE,31.0,NE,NB,Nebr.,Neb.,,,,,, +Nevada,State,US-NV,NV,32.0,NV,NV,Nev.,Nev.,Nv.,,,,, +NewHampshire,State,US-NH,NH,33.0,NH,NH,N.H.,N.H.,,,,,, +NewJersey,State,US-NJ,NJ,34.0,NJ,NJ,N.J.,N.J.,N.Jersey,,,,, +NewMexico,State,US-NM,NM,35.0,NM,NM,N. Mex.,N.M.,New M.,,,,, +NewYork,State,US-NY,NY,36.0,NY,NY,N.Y.,N.Y.,N. York,,,,, +NorthCarolina,State,US-NC,NC,37.0,NC,NC,N.C.,N.C.,N. Car.,,,,, +NorthDakota,State,US-ND,ND,38.0,ND,ND,N. Dak.,N.D.,NoDak,,,,, +Ohio,State,US-OH,OH,39.0,OH,OH,Ohio,Ohio,"O., Oh.",,,,, +Oklahoma,State,US-OK,OK,40.0,OK,OK,Okla.,Okla.,Ok.,,,,, +Oregon,State,US-OR,OR,41.0,OR,OR,Oreg.,Ore.,Or.,,,,, +Pennsylvania,State (Commonwealth),US-PA,PA,42.0,PA,PA,Pa.,Pa.,"Penn., Penna.",,,,, +RhodeIsland,State,US-RI,RI,44.0,RI,RI,R.I.,R.I.,"R.I. & P.P., R. Isl.",,,,, +SouthCarolina,State,US-SC,SC,45.0,SC,SC,S.C.,S.C.,S. Car.,,,,, +SouthDakota,State,US-SD,SD,46.0,SD,SD,S. Dak.,S.D.,SoDak,,,,, +Tennessee,State,US-TN,TN,47.0,TN,TN,Tenn.,Tenn.,,,,,, +Texas,State,US-TX,TX,48.0,TX,TX,Tex.,Texas,Tx.,,,,, +Utah,State,US-UT,UT,49.0,UT,UT,Utah,Utah,Ut.,,,,, +Vermont,State,US-VT,VT,50.0,VT,VT,Vt.,Vt.,,,,,, +Virginia,State (Commonwealth),US-VA,VA,51.0,VA,VA,Va.,Va.,Virg.,,,,, +Washington,State,US-WA,WA,53.0,WA,WN,Wash.,Wash.,"Wa., Wn.[2]",,,,, +WestVirginia,State,US-WV,WV,54.0,WV,WV,W. Va.,W.Va.,"W.V., W. Virg.",,,,, +Wisconsin,State,US-WI,WI,55.0,WI,WS,Wis.,Wis.,"Wi., Wisc.",,,,, +Wyoming,State,US-WY,WY,56.0,WY,WY,Wyo.,Wyo.,Wy.,,,,, +AmericanSamoa,Insular area (Territory),ASASM016US-AS,AS,60.0,AS,AS,A.S.,,,,,,, +Guam,Insular area (Territory),GUGUM316US-GU,GU,66.0,GU,GU,Guam,,,,,,, +NorthernMarianaIslands,Insular area (Commonwealth),MPMNP580US-MP,MP,69.0,MP,CM,M.P.,,CNMI[3],,,,, +PuertoRico,Insular area (Territory),PRPRI630US-PR,PR,72.0,PR,PR,P.R.,,,,,,, +U.S.VirginIslands,Insular area (Territory),VIVIR850US-VI,VI,78.0,VI,VI,V.I.,,U.S.V.I.,,,,, +U.S.MinorOutlyingIslands,Insular areas,UMUMI581US-UM,UM,74.0,,,,,,,,,, +BakerIsland,island,UM-81,,81.0,,,,,XB[4],,,,, +HowlandIsland,island,UM-84,,84.0,,,,,XH[4],,,,, +JarvisIsland,island,UM-86,,86.0,,,,,XQ[4],,,,, +JohnstonAtoll,atoll,UM-67,,67.0,,,,,XU[4],,,,, +KingmanReef,atoll,UM-89,,89.0,,,,,XM[4],,,,, +MidwayIslands,atoll,UM-71,,71.0,,,,,QM[4],,,,, +NavassaIsland,island,UM-76,,76.0,,,,,XV[4],,,,, +PalmyraAtoll[5],atoll[5],UM-95,,95.0,,,,,XL[4],,,,, +WakeIsland,atoll,UM-79,,79.0,,,,,QW[4],,,,, +Micronesia,Freely associated state,FMFSM583,FM,64.0,FM,,,,,,,,, +MarshallIslands,Freely associated state,MHMHL584,MH,68.0,MH,,,,,,,,, +Palau,Freely associated state,PWPLW585,PW,70.0,PW,,,,,,,,, +U.S.ArmedForces–Americas[6],US military mail code,,,,AA,,,,,,,,, +U.S.ArmedForces–Europe[7],US military mail code,,,,AE,,,,,,,,, +U.S.ArmedForces–Pacific[8],US military mail code,,,,AP,,,,,,,,, +NorthernMarianaIslands,Obsolete postal code[9],,,,CM,,,,,,,,, +PanamaCanalZone,Obsolete postal code,PZPCZ594,,,CZ,,,,,,,,, +Nebraska,Obsolete postal code[10],,,,NB,,,,,,,,, +PhilippineIslands,Obsolete postal code,PHPHL608[11],,,PI,,,,,,,,, +TrustTerritoryofthePacificIslands,Obsolete postal code,PCPCI582,,,TT,,,,,,,,, diff --git a/q03_scrape_clean/__pycache__/build.cpython-36.pyc b/q03_scrape_clean/__pycache__/build.cpython-36.pyc index 3d3122e912dfe7d3a8502f062117284f540234a7..50b3979d3130a4829f0d2ccb71613352d344c753 100644 GIT binary patch delta 581 zcmYjMJ#Q015Z>^x5Z_6rdD*NU2UjmLl2+MFK_8qyVbJ!dh04k(g8T&erIW{Vl~)u&VavXiFe<&T-^SAyCDPSQ@okyNDS6O3PJ=}{EOKq8)oj`=sZ$4LCqEdD_Mg0b)BZ^h2$*=thkV2d z1s;-4hSslT2~uN3(h6+neA8A`GH8h)< zw9`(|0c-_bViV_Q6!d71cI$m(j1Jt_a4vdwV5rU`iVGP{i)74-eED!DWiF0JBbp88 z(TgajB*H_+FGt!MeH{rIvT1la91lYwv7{lsxZw0U!ZKo+m>8NDo*h^&tS@1SSKyT> zHcGTIDx)-FvofVsn#5R{l~r1?O-y1D`=wdgm#A`Jxx}d&i{DE-c8I$&NQ0nX%sc2Y z`r+5ASvF&rG=Zxn8)XZ2ofkI$X>qq~!TxULl}_9&Jz#AS?`2(-tJxU>)~z}dN>#NX zv%SfDy70?3WbRj;vJ;zeYlW(Axdj|`!LDANp|U~Rd#GxZE_fl|Kvz%Ckn~80bV-l& zUs_d9ZpH1g2i*O|qtXSwqv}?9EAEh6E8{nVe*n0vakUMor;&jG^~-Ity)ps!V~DT0 z|A7;G@%9RVw-*nzmOJrn(t+$jIUs{ODBg|t$PVePOl=<{(gWSw=$(zez0tcHy?2b{ zzPwWo)}tc#%R&5JeUAYQ#s0IvP8MmR9EmRyy4=<+uc0p(uK^52461?QK-d=w8_~`rzr#B~(rRPtIh)@zf=PafZc|M+S4AT`p#uA^4wLt(6 zC*oQebA^Dek#`~vPBq{O7dW?n7rv6Dvc{IUq41a0*{?hV6*dY9a_f?bA ztAYwClyylZZ^GwsQ-8TVjWQB6wG45R1PzT&l0;bv&6JDtoMuEhLh>kAHUmpp`V-wb zhvT_whv5XSkE2Y4p>iOZE|kk@%C4z$!ArRen(JUTSI#sk#7sGpgbAw59FFD`lTcGy zaENMfnkP^d<>WldB(&rv2Sd!D+6OzzDtH1v1t@elhPfRU*Lo$iP(_q2)H$y zuc>QPpWT4gPNRhKJt%Cjr6Z<{>rV^3x-RPki%dWL6Y-BA=*>5y8B6I1XE3RU_%Nk2 ziAQIQzo;i;G+~5}a-0z?!prsimt1)4s?3+ bIAx?r=tDifVjG0MQ@_5~U9)Z4X7lJ@FO%wA diff --git a/q03_scrape_clean/build.py b/q03_scrape_clean/build.py index 34b85dc..cb7f6c0 100644 --- a/q03_scrape_clean/build.py +++ b/q03_scrape_clean/build.py @@ -6,68 +6,18 @@ import requests sys.path.append(os.path.join(os.path.dirname(os.curdir))) -from bs4 import BeautifulSoup - url = 'https://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations' def q03_scrape_clean(url): - data_from_page = requests.get(url) - page_text = data_from_page.text - data_soup = BeautifulSoup(page_text, 'lxml') - table_data = data_soup.find('table', class_='wikitable sortable') - - column_names = [] - list_of_rows = [] - i = 0 - #Skipping the first table row element since we dont want it in our data frame - for row in table_data.find_all('tr')[9:]: - head_tags = row.findAll('th') - if(len(head_tags) > 0): - for column in head_tags: - column_names.append(column.find(text=True)) - - list_of_cells = [] - data_tags = row.findAll('td') - if(len(data_tags) > 0): - for row_data in data_tags: - row_text = row_data.text.strip() - '''if(len(row_data.find_all('a')) > 0): - #print ('Length of a is: ', len(row_data.find_all('a'))) - #print ('Text of a is: ', row_data.find('a').text) - row_text = row_data.find('a').text - elif(len(row_data.find_all('style')) > 0): - row_text = row_data.find('span').text - else: - row_text = row_data.find(text=True) - ''' - list_of_cells.append(row_text) - i = i + 1 - #print ('Row is: ', list_of_cells) - list_of_rows.append(list_of_cells) + #data_from_page = requests.get(url) + list_of_tables = pd.read_html(url, header = 0, skiprows = 11, attrs = {'class' : 'wikitable sortable'}) + df1 = pd.DataFrame(list_of_tables[0]) + df1 = df1.rename(columns = {'.mw-parser-output .monospaced{font-family:monospace,monospace}USUSA840' : 'US USA 840'}) + df1['United States of America'] = df1['United States of America'].str.replace(' ', '') + df1.to_csv('..\data\scrapeddata.csv', index = False) - outfile = open('D:\GreyAtom\Guided_Project\Scraped_Data\scrapeddata.csv', 'w', newline='') - writer = csv.writer(outfile) - for row_for_csv in list_of_rows: - if(row_for_csv.__contains__('.mw-parser-output .monospaced{font-family:monospace,monospace}USUSA840')): - index_data = row_for_csv.index('.mw-parser-output .monospaced{font-family:monospace,monospace}USUSA840') - row_for_csv.remove('.mw-parser-output .monospaced{font-family:monospace,monospace}USUSA840') - row_for_csv.insert(index_data, 'US USA 840') - writer.writerow(row_for_csv) - outfile.flush() - outfile.close() - - df1 = pd.read_csv('D:\GreyAtom\Guided_Project\Scraped_Data\scrapeddata.csv') - df1['United States of America'] = df1['United States of America'].str.replace(' ', '') - - # % new empty columns are added to match the dataframe results - df1[''] = '' - df1[' '] = '' - df1[' '] = '' - df1[' '] = '' - df1[' '] = '' - print (type(df1)) - print (df1.shape) return df1 + q03_scrape_clean(url) diff --git a/q04_mapping/__pycache__/__init__.cpython-36.pyc b/q04_mapping/__pycache__/__init__.cpython-36.pyc index ee0618f2c89597e870d478c12dab4aec6d95e7c7..22c5dbbe6366234b6d3f11467799dda18d46859e 100644 GIT binary patch delta 55 zcmdnUxR{Z{n3tDp?$W~Oi5%w20s0yFxvBbzdD)pIx{12EsTn1S`r-LS*~JBk$*KCu K`6;Or3mgII=Mvcf delta 69 zcmZ3?xRH^=n3tE!W7eLii5%vJar&XfsYS*5iAfom#TlvD#idC_iMfe+nTdJ&E~&-Y YCHV#V`30$Y@dZWsS*gh-#S@bp0WxD5Gynhq diff --git a/q04_mapping/__pycache__/build.cpython-36.pyc b/q04_mapping/__pycache__/build.cpython-36.pyc index 8283165768b4bf279030f79c34448f8435bb4125..7667d9463b151d46989d42a1dde32c74a0b9b0ca 100644 GIT binary patch literal 2289 zcmZ`)&2Jnv6rb^Y?suA`X(|v}l>=HJo1`rQl?stkH5H-}(l&}T1P$J`H<{^t#pCTJ z%gluZsRt_Y7jWRlzvL^YTsii{vu6_)1Tz}HpP!$fJ@)Utcdyl|f0xd_dDYRhKeVL> z0sb7moIpb~qDPviD~UX$q~RG#nw|v`MRx3Xj$YZio~y=cUJaxX)#HZOfH9L;KWbi+ z*u(+ZA}*+1kOBz>ctR+oNYUI8Sgz+U#-Zb_Z zPDuB8%5fx&Cv+%`XP60#OGOVc3pT*ycCN6!gqdtv00SRl-MNSGPsr*LlY z%YIJz?;2}@#hYi_r)f;Laq=SITX-v`ryOq|rR+tX;St>)rG##0urZwbCzF6s;%6)k z>4^Yq5)j5Z#MBFG4(dVc%3L`WLA_g&EqtZOlN((-lSfz9RYhi{KvFAE*i1rh@ ztv%MheO0=J8`{K$JF9W0sKKc9!el?3zf;s;e50%vR_GRW@LD7F*ZN%7wX@SB4ZJqg z%m&PC!p!Dk=F{_T(S(_sWvgg~D6Gx3vRyR5M_bMP?MN#e(!8aWPGN%<=7YzlM;fn_ z7HN|XS$S=g9o`6=MF)JZoIfgT@b{p+QQQbyvIdOX2yc;f(wQUKf2ffT$j(BpFXYXI z+*rt4z|J*b=jMTDM(1$^JjBC@&Q_-Z5Jkdno-zd>fb3C(^W0}Las=Y+vQ~l9=P+n; zdd@*hF6Hp-@kq^IIlXAu>Dc~Qs`5mfjcLm z(`6bKZe^I|lx?LGo=teSAE!yGh)cd7rwQL0<2Z;e9$c$Gyhi2W;Bc__$^D&K z{cz9)*p+zmFbOy%-2um(=G}DM-HRy;M)-~1ouO`bW^|!3q3QNEh0EyKgyuXK#tG$2 zGMO3j&&_d=5YLsZkE6(QB-)E0t|OWVyNVBC<(vhXFjI&YVaQ*!rwo>7qUrl1*gg)D z-1mhAlj*sz8I9AIRM?=UjC*cnoKA%`jwbo3utrguQ-LyAno&q^(xm~52#3)uf^hDw z3uD3}$cQovj9D7{89t$2O}4(gCrGF&K|mh#>gsw`lJuIRG@8VTpWvA0UNZ`E?x$lJ z;hcO z5RB+?Mmdwg3b9n$3^#O6H5D;V2xhmbNoePr$BQA_tkg*SxNt8}0WA909KrZju!jffTw-R@~buzAK zQc)H{Q(pA7b`hfLT7+_REN3`GVdnfK8NPQfs+VwxoY?IqP3+WmXj7%9?Ykfz z1Mv@+eep2#~E!~E?B1IRpHKeBLMh- z4*-1)!k_`I@hvg9A_E4?8$5=eqRKP=8dm`$%)5gK=UpI>{==TLou>*JEIA?8*;ndBqhN`1%`CuIS&NlT*!=PH4U= z^@?BUE?2anqNF1E$n}MhZ6cvxS*v8l3!{^k3PyEy-j$5A2X}IL^g3&xv!yss@~*7e zyzOngsA-ckdNOf*0O;n$R^WV5%YxSW1dL6bC8x{#5aO&)^IX|LyHaO+$~}}NEwi!W zlx2nfVneD;-*W3|1A%aL-P;|_O;!rVuWe*x#>&FjuoO_Cu|%}i6I56ac+LVFwA8F# zZq%`j*;#g03R 0): + for column in head_tags: + column_names.append(column.find(text=True)) + + list_of_cells = [] + data_tags = row.findAll('td') + if(len(data_tags) > 0): + for row_data in data_tags: + row_text = row_data.text.strip() + list_of_cells.append(row_text) + i = i + 1 + #print ('Row is: ', list_of_cells) + list_of_rows.append(list_of_cells) + + outfile = open('D:\GreyAtom\Guided_Project\Scraped_Data\scrapeddata.csv', 'w', newline='') + writer = csv.writer(outfile) + for row_for_csv in list_of_rows: + if(row_for_csv.__contains__('.mw-parser-output .monospaced{font-family:monospace,monospace}USUSA840')): + index_data = row_for_csv.index('.mw-parser-output .monospaced{font-family:monospace,monospace}USUSA840') + row_for_csv.remove('.mw-parser-output .monospaced{font-family:monospace,monospace}USUSA840') + row_for_csv.insert(index_data, 'US USA 840') + writer.writerow(row_for_csv) + outfile.flush() + outfile.close() + + df1 = pd.read_csv('D:\GreyAtom\Guided_Project\Scraped_Data\scrapeddata.csv') + df1['United States of America'] = df1['United States of America'].str.replace(' ', '') + + # % new empty columns are added to match the dataframe results + df1[''] = '' + df1[' '] = '' + df1[' '] = '' + df1[' '] = '' + df1[' '] = '' + return df1 + + def q04_mapping(path1,path2): - "write your solution here" + df_from_appended_row = q02_append_row(path1) + df_from_scraped_data = q03_scrape_clean(path2) + + '''Approach 1 for creating a dictionary + mapping = df_from_scraped_data.set_index('United States of America').to_dict()['U.S.'] + ''' + '''Approach 2 for creating a dictionary + ''' + mapping = dict(zip(df_from_scraped_data['United States of America'].str.lower(), df_from_scraped_data['U.S.'])) + df_final = df_from_appended_row + #Inseting 'abbr' at column 6 - Approach 2 for this is given below + df_final.insert(6, 'abbr', '') + '''Approach 1 + Advantage of this approach is if there are no matching keys in the dictioinary, then the column values are mapped to NaN + ''' + df_final['abbr'] = df_final['state'].str.lower().map(mapping) + '''Approach 2 + df_final['abbr'] = df_final['state'].str.lower().replace(mapping) + df_final.loc[df_from_appended_row['abbr'] == df_final['state'],'abbr'] = float('nan') + ''' + + '''Approach 2 for ordering columns on a specified index + Reindexing the columns can also be done using reindex_axis method. + df_final = df_final.reindex_axis(['account', 'name', 'street', 'city', 'state', 'postal-code', 'abbr', 'Jan', 'Feb', 'Mar', 'total'], axis=1) + ''' + return df_final + diff --git a/q04_mapping/tests/__pycache__/__init__.cpython-36.pyc b/q04_mapping/tests/__pycache__/__init__.cpython-36.pyc index eef3d6b1231b64feb39c7d081ba06fa45658052b..889011cc07b01c11d0da5ad53f78db24feb64b7f 100644 GIT binary patch delta 55 zcmdnaxRQ~>n3tDp?$W~Oi5%w2Vfq>QxvBbzdD)pIx{12EsTn1S`r-LS*~JBk$*KCu K`6;Or3)}(iSrYL8 delta 69 zcmZ3~wiYF$y0{~*c8VCRY diff --git a/q04_mapping/tests/__pycache__/test.cpython-36.pyc b/q04_mapping/tests/__pycache__/test.cpython-36.pyc index 7f7c96e914a8d9dd0deb7c46cab1b7ff9ad3e05a..aa4c2773811ae3673dab5cf30ffb02e3d7acbf11 100644 GIT binary patch delta 58 zcmbQr+s?~j%*)F)cWGhtMvmu<%0c=W`MIh3iFw(XCAx{axv3c?iTdIBMcKs#iOH$@ N$@wX%n-!Q!SpYCe6PN%1 delta 72 zcmZqYoyyB$%*)H=vv*I_MvmuQb`iV&ynZ+5Y*~O(vMTxnId6|iM`Yx%( b*(Lb}`uPQ^dGQ5B`B|ySCB>VCm`Ygyn3ftE From 9d2abcda879624468309eaf0011622c88d94f353 Mon Sep 17 00:00:00 2001 From: ankit-a-mehta Date: Mon, 20 Aug 2018 18:06:35 +0000 Subject: [PATCH 6/7] Done --- data/scraped.csv | 28 ++--- .../__pycache__/build.cpython-36.pyc | Bin 577 -> 577 bytes .../__pycache__/build.cpython-36.pyc | Bin 655 -> 655 bytes q02_append_row/build.py | 7 -- q04_mapping/__pycache__/build.cpython-36.pyc | Bin 2289 -> 1140 bytes q04_mapping/build.py | 101 +++--------------- 6 files changed, 29 insertions(+), 107 deletions(-) diff --git a/data/scraped.csv b/data/scraped.csv index 51c10a1..49f8e0b 100644 --- a/data/scraped.csv +++ b/data/scraped.csv @@ -1,4 +1,4 @@ -,United States of America,Federal state,US USA 840,US,00,,,U.S.,U.S.,U.S.A.,,,,, +,United States of America,Federal state,".mw-parser-output .monospaced{font-family:monospace,monospace}USUSA840",US,00,,,U.S.,U.S.,U.S.A.,,,,, 12,Alabama,State,US-AL,AL,01,AL,AL,Ala.,Ala.,,,,,, 13,Alaska,State,US-AK,AK,02,AK,AK,Alaska,Alaska,Alas.,,,,, 14,Arizona,State,US-AZ,AZ,04,AZ,AZ,Ariz.,Ariz.,Az.,,,,, @@ -9,7 +9,7 @@ 19,Delaware,State,US-DE,DE,10,DE,DL,Del.,Del.,De.,,,,, 20,DistrictofColumbia,Federal district,US-DC,DC,11,DC,DC,D.C.,D.C.,Wash. D.C.,,,,, 21,Florida,State,US-FL,FL,12,FL,FL,Fla.,Fla.,"Fl., Flor.",,,,, -22,Georgia,State,US-GA,GA,13,GA,GA,Ga.,Ga.,,,,,, +22,Georgia,State,US-GA,GA,13,GA,GA,Ga.,Ga.,Geo.,,,,, 23,Hawaii,State,US-HI,HI,15,HI,HA,Hawaii,Hawaii,H.I.,,,,, 24,Idaho,State,US-ID,ID,16,ID,ID,Idaho,Idaho,"Id., Ida.",,,,, 25,Illinois,State,US-IL,IL,17,IL,IL,Ill.,Ill.,"Il., Ills., Ill's",,,,, @@ -50,12 +50,12 @@ 60,WestVirginia,State,US-WV,WV,54,WV,WV,W. Va.,W.Va.,"W.V., W. Virg.",,,,, 61,Wisconsin,State,US-WI,WI,55,WI,WS,Wis.,Wis.,"Wi., Wisc.",,,,, 62,Wyoming,State,US-WY,WY,56,WY,WY,Wyo.,Wyo.,Wy.,,,,, -63,AmericanSamoa,Insular area (Territory),AS ASM 016 US-AS,AS,60,AS,AS,A.S.,,,,,,, -64,Guam,Insular area (Territory),GU GUM 316 US-GU,GU,66,GU,GU,Guam,,,,,,, -65,NorthernMarianaIslands,Insular area (Commonwealth),MP MNP 580 US-MP,MP,69,MP,CM,M.P.,,CNMI[3],,,,, -66,PuertoRico,Insular area (Territory),PR PRI 630 US-PR,PR,72,PR,PR,P.R.,,,,,,, -67,U.S.VirginIslands,Insular area (Territory),VI VIR 850 US-VI,VI,78,VI,VI,V.I.,,U.S.V.I.,,,,, -68,U.S.MinorOutlyingIslands,Insular areas,UM UMI 581 US-UM,UM,74,,,,,,,,,, +63,AmericanSamoa,Insular area (Territory),ASASM016US-AS,AS,60,AS,AS,A.S.,,,,,,, +64,Guam,Insular area (Territory),GUGUM316US-GU,GU,66,GU,GU,Guam,,,,,,, +65,NorthernMarianaIslands,Insular area (Commonwealth),MPMNP580US-MP,MP,69,MP,CM,M.P.,,CNMI[3],,,,, +66,PuertoRico,Insular area (Territory),PRPRI630US-PR,PR,72,PR,PR,P.R.,,,,,,, +67,U.S.VirginIslands,Insular area (Territory),VIVIR850US-VI,VI,78,VI,VI,V.I.,,U.S.V.I.,,,,, +68,U.S.MinorOutlyingIslands,Insular areas,UMUMI581US-UM,UM,74,,,,,,,,,, 69,BakerIsland,island,UM-81,,81,,,,,XB[4],,,,, 70,HowlandIsland,island,UM-84,,84,,,,,XH[4],,,,, 71,JarvisIsland,island,UM-86,,86,,,,,XQ[4],,,,, @@ -65,14 +65,14 @@ 75,NavassaIsland,island,UM-76,,76,,,,,XV[4],,,,, 76,PalmyraAtoll[5],atoll[5],UM-95,,95,,,,,XL[4],,,,, 77,WakeIsland,atoll,UM-79,,79,,,,,QW[4],,,,, -78,Micronesia,Freely associated state,FM FSM 583,FM,64,FM,,,,,,,,, -79,MarshallIslands,Freely associated state,MH MHL 584,MH,68,MH,,,,,,,,, -80,Palau,Freely associated state,PW PLW 585,PW,70,PW,,,,,,,,, +78,Micronesia,Freely associated state,FMFSM583,FM,64,FM,,,,,,,,, +79,MarshallIslands,Freely associated state,MHMHL584,MH,68,MH,,,,,,,,, +80,Palau,Freely associated state,PWPLW585,PW,70,PW,,,,,,,,, 81,U.S.ArmedForces–Americas[6],US military mail code,,,,AA,,,,,,,,, 82,U.S.ArmedForces–Europe[7],US military mail code,,,,AE,,,,,,,,, 83,U.S.ArmedForces–Pacific[8],US military mail code,,,,AP,,,,,,,,, 84,NorthernMarianaIslands,Obsolete postal code[9],,,,CM,,,,,,,,, -85,PanamaCanalZone,Obsolete postal code,PZ PCZ 594,,,CZ,,,,,,,,, +85,PanamaCanalZone,Obsolete postal code,PZPCZ594,,,CZ,,,,,,,,, 86,Nebraska,Obsolete postal code[10],,,,NB,,,,,,,,, -87,PhilippineIslands,Obsolete postal code,PH PHL 608[11],,,PI,,,,,,,,, -88,TrustTerritoryofthePacificIslands,Obsolete postal code,PC PCI 582,,,TT,,,,,,,,, +87,PhilippineIslands,Obsolete postal code,PHPHL608[11],,,PI,,,,,,,,, +88,TrustTerritoryofthePacificIslands,Obsolete postal code,PCPCI582,,,TT,,,,,,,,, diff --git a/q01_load_data/__pycache__/build.cpython-36.pyc b/q01_load_data/__pycache__/build.cpython-36.pyc index 40b18a9c81de8e3a1346a64fb2c3e8a15dab9878..2eaef35d26c85a48688daed4c826b8e0b0c3c572 100644 GIT binary patch delta 51 zcmX@ea*&19n3tDp%|_OxjEuJ?uV(a-5@cXt&}6yAR*;yNl2}~C1LPL*g9yRNf=mVg DWfKfs delta 51 zcmX@ea*&19n3tDp-bU7?jEqr}S2OxZ2{14)XtLa5D@e>sNh~hn0dkA@K!m_#K_&wL DP?`(N diff --git a/q02_append_row/__pycache__/build.cpython-36.pyc b/q02_append_row/__pycache__/build.cpython-36.pyc index 1aeab8fe7470a33cf58fc39823b0d5e4884911e3..72cf36e874294e1697ec6f69c601dda47f8b1e73 100644 GIT binary patch delta 27 icmeBY?PujO=H=y*Vy=#^XWGd1k&%&U@*l=YjNAZNyawC= delta 27 jcmeBY?PujO=H=x&vY{|~5%Wf_kBp3*lm9SIViW=Zal8my diff --git a/q02_append_row/build.py b/q02_append_row/build.py index 16baba0..d9a1759 100644 --- a/q02_append_row/build.py +++ b/q02_append_row/build.py @@ -15,13 +15,6 @@ def q02_append_row(path): data_set.at['Grand Total', 'Mar'] = data_set['Mar'].sum() data_set.at['Grand Total', 'total'] = data_set['total'].sum() - ''' - Approach 2 - ''' - #data_set.loc['Grand Total', 'Jan'] = data_set['Jan'].sum() - #data_set.loc['Grand Total', 'Feb'] = data_set['Feb'].sum() - #data_set.loc['Grand Total', 'Mar'] = data_set['Mar'].sum() - #data_set.loc['Grand Total', 'total'] = data_set['total'].sum() return data_set q02_append_row(path) diff --git a/q04_mapping/__pycache__/build.cpython-36.pyc b/q04_mapping/__pycache__/build.cpython-36.pyc index 7667d9463b151d46989d42a1dde32c74a0b9b0ca..5b38f4c2bcde18ee6b0dc2fb7d927225825ad144 100644 GIT binary patch literal 1140 zcma)5&2HQ_5GE-~EB~(-ImA7*zV5-2n-oD17)Fcs*ygeb3a}{zTBL2tk|>kh*xqPQ zn>XlFYZkn?LtgN`~k5& zk+88Cv3i z3}FGm1F#u*`xO`kzUH&28rU8$SybU(lgWF~0w&7N<`zeskC_g(h9{Jb?2Bsqu zx=k3DZCL3F)e3pbMot&uBR&SU8tW6)T4|h*0L(P;amU@|w`=!|JrU={xzWwN=l~yN zf5@(d7lq)C`_cV0EDcSn<@EzkW&=?8tw&>QLBfu~Ae`bfn&1hh(Jr14^8|8>gYe2+ z-j=dH3UnNW`|P&1w~NzD=Q{f;&t;q4s4LY;p?IdvMIP?*pOtlmJ}-ji#lS=nc#-cE zcS^T*I`PR`5l=edBYS5((U!&Syy@;dX**8?2R>OJbWba-T0auXw7ii%Ddz@a_Ts|G zJMMH-tF!DsCC=HJo1`rQl?stkH5H-}(l&}T1P$J`H<{^t#pCTJ z%gluZsRt_Y7jWRlzvL^YTsii{vu6_)1Tz}HpP!$fJ@)Utcdyl|f0xd_dDYRhKeVL> z0sb7moIpb~qDPviD~UX$q~RG#nw|v`MRx3Xj$YZio~y=cUJaxX)#HZOfH9L;KWbi+ z*u(+ZA}*+1kOBz>ctR+oNYUI8Sgz+U#-Zb_Z zPDuB8%5fx&Cv+%`XP60#OGOVc3pT*ycCN6!gqdtv00SRl-MNSGPsr*LlY z%YIJz?;2}@#hYi_r)f;Laq=SITX-v`ryOq|rR+tX;St>)rG##0urZwbCzF6s;%6)k z>4^Yq5)j5Z#MBFG4(dVc%3L`WLA_g&EqtZOlN((-lSfz9RYhi{KvFAE*i1rh@ ztv%MheO0=J8`{K$JF9W0sKKc9!el?3zf;s;e50%vR_GRW@LD7F*ZN%7wX@SB4ZJqg z%m&PC!p!Dk=F{_T(S(_sWvgg~D6Gx3vRyR5M_bMP?MN#e(!8aWPGN%<=7YzlM;fn_ z7HN|XS$S=g9o`6=MF)JZoIfgT@b{p+QQQbyvIdOX2yc;f(wQUKf2ffT$j(BpFXYXI z+*rt4z|J*b=jMTDM(1$^JjBC@&Q_-Z5Jkdno-zd>fb3C(^W0}Las=Y+vQ~l9=P+n; zdd@*hF6Hp-@kq^IIlXAu>Dc~Qs`5mfjcLm z(`6bKZe^I|lx?LGo=teSAE!yGh)cd7rwQL0<2Z;e9$c$Gyhi2W;Bc__$^D&K z{cz9)*p+zmFbOy%-2um(=G}DM-HRy;M)-~1ouO`bW^|!3q3QNEh0EyKgyuXK#tG$2 zGMO3j&&_d=5YLsZkE6(QB-)E0t|OWVyNVBC<(vhXFjI&YVaQ*!rwo>7qUrl1*gg)D z-1mhAlj*sz8I9AIRM?=UjC*cnoKA%`jwbo3utrguQ-LyAno&q^(xm~52#3)uf^hDw z3uD3}$cQovj9D7{89t$2O}4(gCrGF&K|mh#>gsw`lJuIRG@8VTpWvA0UNZ`E?x$lJ z;hcO z5RB+?Mmdwg3b9n$3^#O6H5D;V2xhmbNoePr$BQA_tkg*SxNt8}0WA909KrZju!jffTw-R@~buzAK zQc)H{Q(pA7b`hfLT7+_REN3`GVdnfK8N 0): - for column in head_tags: - column_names.append(column.find(text=True)) - - list_of_cells = [] - data_tags = row.findAll('td') - if(len(data_tags) > 0): - for row_data in data_tags: - row_text = row_data.text.strip() - list_of_cells.append(row_text) - i = i + 1 - #print ('Row is: ', list_of_cells) - list_of_rows.append(list_of_cells) - - outfile = open('D:\GreyAtom\Guided_Project\Scraped_Data\scrapeddata.csv', 'w', newline='') - writer = csv.writer(outfile) - for row_for_csv in list_of_rows: - if(row_for_csv.__contains__('.mw-parser-output .monospaced{font-family:monospace,monospace}USUSA840')): - index_data = row_for_csv.index('.mw-parser-output .monospaced{font-family:monospace,monospace}USUSA840') - row_for_csv.remove('.mw-parser-output .monospaced{font-family:monospace,monospace}USUSA840') - row_for_csv.insert(index_data, 'US USA 840') - writer.writerow(row_for_csv) - outfile.flush() - outfile.close() - - df1 = pd.read_csv('D:\GreyAtom\Guided_Project\Scraped_Data\scrapeddata.csv') - df1['United States of America'] = df1['United States of America'].str.replace(' ', '') - - # % new empty columns are added to match the dataframe results - df1[''] = '' - df1[' '] = '' - df1[' '] = '' - df1[' '] = '' - df1[' '] = '' - return df1 +sys.path.append(os.path.join(os.path.dirname(os.curdir))) +from greyatomlib.pandas_guided_project.q02_append_row.build import q02_append_row +#Custom +from greyatomlib.pandas_guided_project.q03_scrape_clean.build import q03_scrape_clean def q04_mapping(path1,path2): - df_from_appended_row = q02_append_row(path1) + df_from_appended_row = q02_append_row(path1) df_from_scraped_data = q03_scrape_clean(path2) - '''Approach 1 for creating a dictionary - mapping = df_from_scraped_data.set_index('United States of America').to_dict()['U.S.'] - ''' - '''Approach 2 for creating a dictionary - ''' - mapping = dict(zip(df_from_scraped_data['United States of America'].str.lower(), df_from_scraped_data['U.S.'])) - + #Approach 1 for creating a dictionary + mapping = df_from_scraped_data.set_index('United States of America').to_dict()['U.S.'] + mapping = {k.lower(): v for k,v in mapping.items()} df_final = df_from_appended_row + #Inseting 'abbr' at column 6 - Approach 2 for this is given below df_final.insert(6, 'abbr', '') - '''Approach 1 - Advantage of this approach is if there are no matching keys in the dictioinary, then the column values are mapped to NaN - ''' - df_final['abbr'] = df_final['state'].str.lower().map(mapping) + df_final['abbr'] = df_final['state'].map(mapping) - '''Approach 2 - df_final['abbr'] = df_final['state'].str.lower().replace(mapping) - df_final.loc[df_from_appended_row['abbr'] == df_final['state'],'abbr'] = float('nan') - ''' - - '''Approach 2 for ordering columns on a specified index - Reindexing the columns can also be done using reindex_axis method. - df_final = df_final.reindex_axis(['account', 'name', 'street', 'city', 'state', 'postal-code', 'abbr', 'Jan', 'Feb', 'Mar', 'total'], axis=1) - ''' return df_final - + +path1 = 'data/excel-comp-data.xlsx' +path2 = 'https://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations' +q04_mapping(path1, path2) + From 75c173b0a22fac50931318406e6f455b57555a8c Mon Sep 17 00:00:00 2001 From: ankit-a-mehta Date: Mon, 20 Aug 2018 18:08:45 +0000 Subject: [PATCH 7/7] Done --- q02_append_row/__pycache__/build.cpython-36.pyc | Bin 655 -> 655 bytes q02_append_row/build.py | 7 +++++++ 2 files changed, 7 insertions(+) diff --git a/q02_append_row/__pycache__/build.cpython-36.pyc b/q02_append_row/__pycache__/build.cpython-36.pyc index 72cf36e874294e1697ec6f69c601dda47f8b1e73..71d7fc12956f419c92da5d169a5a1f4ef6532be5 100644 GIT binary patch delta 27 jcmeBY?PujO=H=y@!(1J`h