-
Notifications
You must be signed in to change notification settings - Fork 29
Open
Description
- occupationcoder version: 0.2.0
- Python version: 3.6
- Operating System: CentOS
Description
Hi @aeturrell! I am hitting a bug in occupationcoder/utilities/utilities.py, line 319:
def getKey(item):
    """
    Build a multi-criteria sort key from an indexable record.

    Takes any indexable sequence with at least three elements as input and
    returns a tuple. The tuple specifies the order of argument importance,
    so it can be used for sorting and for getting the max using multiple
    criteria.

    >>> getKey(('registered nurse', 90, 4))
    (90, 4, 2)

    Raises IndexError if ``item`` has fewer than three elements.
    """
    # Priority order: item[1], then item[2], then the number of
    # whitespace-separated words in item[0].
    return (item[1], item[2], len(item[0].split()))
I get a "tuple index out of range" error, even though the code itself looks fine. The full traceback is below.
The problem seems to be triggered by one specific data file (I am looping over several), but after checking the file structure, the variable types, the string contents, etc., I can see no apparent difference from the other files. The code I am running is below. Any guesses as to where the error might be coming from? Thanks!
What I Did
#!/usr/bin/python36
import pandas as pd
from occupationcoder.coder import coder
import os.path
import re
import os
from os import walk
import glob
import nltk
# Download the NLTK resources requested by this script before any coding is done.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

# NOTE(review): `dir` shadows the builtin of the same name; it is kept because
# the rest of the script refers to it by this name.
dir = '/home/ALL/'
# NOTE(review): this literal was cut off in the report (the closing quote was
# missing, so the script could not be parsed as pasted). The quote is closed
# here so the file parses; the real output folder is presumably a longer path
# -- confirm before running.
dir1 = '/home/'

myCoder = coder.Coder()

# Seed vacancy_stock_raw.csv with a single all-zero row carrying the 20 output
# column names, then read it straight back so df0 is the on-disk version.
d0 = [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
df0 = pd.DataFrame(
    data=d0,
    columns=[
        'year', 'month', 'day', 'category_id', 'company', 'date_created',
        'location_raw', 'salary_max', 'salary_min', 'salary_predicted',
        'salary_currency', 'salary_raw', 'title', 'category_name',
        'location_path', 'contract_time', 'contract_type', 'company_id',
        'company_name', 'description',
    ],
)
df0.to_csv(os.path.join(dir1, 'vacancy_stock_raw.csv'))
df0 = pd.read_csv(os.path.join(dir1, 'vacancy_stock_raw.csv'))
# Loop over the folders and append the selected variables to vacancy_stock_raw.csv
def main():
# Walks a directory tree with os.walk, reads every file whose path ends in
# ".csv", runs it through myCoder.codedataframe, and appends a fixed selection
# of columns to df2, which is written to vacancy_stock_raw.csv under dir1.
#
# NOTE(review): the indentation of this block was lost when the script was
# pasted, so the nesting of the statements below cannot be determined from the
# text alone. In particular it is unclear whether `def list_files(dir):` really
# wraps the loop: the traceback in this report shows the codedataframe call
# running in the frame `main`, not `list_files` -- confirm against the real file.
# Accumulator for the coded rows; starts from the module-level df0.
df2=df0
# `li` is not referenced again in the visible code.
li=[]
r = []
def list_files(dir):
r = []
for root, dirs, files in os.walk(dir):
for name in files:
# Build the full path by hand from the walked folder and the file name.
filepath = root + os.sep + name
if filepath.endswith(".csv"):
print(filepath)
d = pd.read_csv(filepath, index_col=None, header=0)
# Rename the input columns to the job_* names used from here on.
d = d.rename(columns={'title': 'job_title'})
d = d.rename(columns={'description': 'job_description'})
d = d.rename(columns={'category_name': 'job_sector'})
# Cast every column to str, then truncate the three text fields
# (100 characters for title and sector, 200 for the description).
d=d.astype(str)
d['job_title'] = d['job_title'].str.slice(0, 100)
d['job_sector'] = d['job_sector'].str.slice(0, 100)
d['job_description'] = d['job_description'].str.slice(0, 200)
# Keep only the columns of interest before coding.
d = d[[ 'year', 'month', 'day', 'category_id', 'company', 'date_created','location_raw', 'salary_max', 'salary_min', 'salary_predicted',
'salary_currency', 'job_title', 'job_sector', 'location_path', 'contract_time',
'contract_type', 'company_id', 'company_name', 'job_description']]
# Plain copy of job_title under a second column name.
d['titleno_space'] = d['job_title']
print('ready')
# This is the call that raises the IndexError shown in the traceback.
# Presumably it adds the SOC_code column selected just below -- verify
# against the occupationcoder documentation.
df1=myCoder.codedataframe(d)
print('socdone')
# Same selection as above, with SOC_code in place of job_description.
df1 = df1[[ 'year', 'month', 'day', 'category_id', 'company', 'date_created','location_raw', 'salary_max', 'salary_min', 'salary_predicted',
'salary_currency', 'job_title', 'job_sector', 'location_path', 'contract_time',
'contract_type', 'company_id', 'company_name', 'SOC_code']]
df2=df2.append(df1)
# Write the accumulated frame back to the output file under dir1.
df2.to_csv(os.path.join(dir1, 'vacancy_stock_raw.csv'))
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '–']
Traceback (most recent call last):
File "./1adzuna.py", line 67, in <module>
main()
File "./1adzuna.py", line 57, in main
df1=myCoder.codedataframe(d)
File "/home/cc18002/.local/lib/python3.6/site-packages/occupationcoder/coder/coder.py", line 104, in codedataframe
x = res.compute(scheduler='processes')
File "/usr/local/lib/python3.6/site-packages/dask/base.py", line 156, in compute
(result,) = compute(self, traverse=False, **kwargs)
File "/usr/local/lib/python3.6/site-packages/dask/base.py", line 397, in compute
results = schedule(dsk, keys, **kwargs)
File "/usr/local/lib/python3.6/site-packages/dask/multiprocessing.py", line 192, in get
raise_exception=reraise, **kwargs)
File "/usr/local/lib/python3.6/site-packages/dask/local.py", line 501, in get_async
raise_exception(exc, tb)
File "/usr/local/lib/python3.6/site-packages/dask/compatibility.py", line 111, in reraise
raise exc.with_traceback(tb)
File "/usr/local/lib/python3.6/site-packages/dask/local.py", line 272, in execute_task
result = _execute_task(task, data)
File "/usr/local/lib/python3.6/site-packages/dask/local.py", line 253, in _execute_task
return func(*args2)
File "/usr/local/lib/python3.6/site-packages/dask/dataframe/core.py", line 3684, in apply_and_enforce
df = func(*args, **kwargs)
File "/usr/local/lib/python3.6/site-packages/dask/utils.py", line 694, in __call__
return getattr(obj, self.method)(*args, **kwargs)
File "/home/cc18002/.local/lib/python3.6/site-packages/pandas/core/frame.py", line 6928, in apply
return op.get_result()
File "/home/cc18002/.local/lib/python3.6/site-packages/pandas/core/apply.py", line 186, in get_result
return self.apply_standard()
File "/home/cc18002/.local/lib/python3.6/site-packages/pandas/core/apply.py", line 292, in apply_standard
self.apply_series_generator()
File "/home/cc18002/.local/lib/python3.6/site-packages/pandas/core/apply.py", line 321, in apply_series_generator
results[i] = self.f(v)
File "/home/cc18002/.local/lib/python3.6/site-packages/occupationcoder/utilities/utilities.py", line 392, in return_best_match_2
final_code = max(items(), key=lambda x: getKey(x[1]))
File "/home/cc18002/.local/lib/python3.6/site-packages/occupationcoder/utilities/utilities.py", line 392, in <lambda>
final_code = max(items(), key=lambda x: getKey(x[1]))
File "/home/cc18002/.local/lib/python3.6/site-packages/occupationcoder/utilities/utilities.py", line 319, in getKey
return (item[1], item[2], len(item[0].split()))
IndexError: ('tuple index out of range', 'occurred at index 706')
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels