Nicky Nicolson committed on
Commit
5aa6463
·
1 Parent(s): 7f36417

Pass in working directory as arg to script

Browse files
Files changed (2) hide show
  1. Dockerfile +1 -1
  2. tab2csv.py +12 -6
Dockerfile CHANGED
@@ -18,7 +18,7 @@ RUN ls -lh /data
18
  COPY ./tab2csv.py /code/tab2csv.py
19
 
20
 
21
- RUN python tab2csv.py --createcols ${GBIF_DOWNLOAD_ID} /data/gbifocc.csv
22
  RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
23
  RUN ls -l /code
24
  RUN sqlite-utils tables /code/gbifocc.db --counts
 
18
  COPY ./tab2csv.py /code/tab2csv.py
19
 
20
 
21
+ RUN python tab2csv.py --createcols /data ${GBIF_DOWNLOAD_ID} gbifocc.csv
22
  RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
23
  RUN ls -l /code
24
  RUN sqlite-utils tables /code/gbifocc.db --counts
tab2csv.py CHANGED
@@ -66,27 +66,33 @@ def getGbifDownloadColumnNames(download_format):
66
 
67
  if __name__ == '__main__':
68
  parser = argparse.ArgumentParser()
 
69
  parser.add_argument("download_id")
70
  parser.add_argument("-c","--createcols", action='store_true')
71
  parser.add_argument("-l","--limit", type=int)
72
- parser.add_argument("outputfile")
73
  args = parser.parse_args()
74
 
75
  # Determine format of datafile by accessing download metadata from GBIF API
76
  gbif_metadata = occ.download_meta(key = args.download_id)
77
  download_format = gbif_metadata['request']['format']
78
- inputfile = None
 
 
 
 
 
79
  column_names_simple_csv = getGbifDownloadColumnNames('SIMPLE_CSV')
80
  column_names = None
81
  if download_format == 'SIMPLE_CSV':
82
- inputfile = '{}.csv'.format(args.download_id)
83
  column_names = column_names_simple_csv
84
  elif download_format == 'DWCA':
85
- inputfile = 'occurrence.txt'
86
  column_names_dwca = getGbifDownloadColumnNames('DWCA')
87
  column_names = [column_name for column_name in column_names_dwca if column_name in column_names_simple_csv]
88
 
89
- df = pd.read_csv(os.path.join('data',inputfile),
90
  encoding='utf8',
91
  keep_default_na=False,
92
  on_bad_lines='skip',
@@ -103,4 +109,4 @@ if __name__ == '__main__':
103
  # Add column holding collector name and number
104
  mask = (df.recordNumber.notnull())
105
  df.loc[mask,'collectorNameAndNumber']=df[mask].apply(lambda row: '{} {}'.format(row['recordedBy_first_familyname'],row['recordNumber']),axis=1)
106
- df.to_csv(args.outputfile, index=False, sep=',')
 
66
 
67
  if __name__ == '__main__':
68
  parser = argparse.ArgumentParser()
69
+ parser.add_argument("data_dir")
70
  parser.add_argument("download_id")
71
  parser.add_argument("-c","--createcols", action='store_true')
72
  parser.add_argument("-l","--limit", type=int)
73
+ parser.add_argument("outputfilename")
74
  args = parser.parse_args()
75
 
76
  # Determine format of datafile by accessing download metadata from GBIF API
77
  gbif_metadata = occ.download_meta(key = args.download_id)
78
  download_format = gbif_metadata['request']['format']
79
+ # The GBIF download format determines:
80
+ # (1) the columns in the download, SIMPLE_CSV being a much more restricted set
81
+ # of columns than DWCA
82
+ # (2) The name of the occurrence data file, SIMPLE_CSV : '[download_id].csv'
83
+ # DWCA : 'occurrence.txt'
84
+ inputfilename = None
85
  column_names_simple_csv = getGbifDownloadColumnNames('SIMPLE_CSV')
86
  column_names = None
87
  if download_format == 'SIMPLE_CSV':
88
+ inputfilename = '{}.csv'.format(args.download_id)
89
  column_names = column_names_simple_csv
90
  elif download_format == 'DWCA':
91
+ inputfilename = 'occurrence.txt'
92
  column_names_dwca = getGbifDownloadColumnNames('DWCA')
93
  column_names = [column_name for column_name in column_names_dwca if column_name in column_names_simple_csv]
94
 
95
+ df = pd.read_csv(os.path.join(args.data_dir,inputfilename),
96
  encoding='utf8',
97
  keep_default_na=False,
98
  on_bad_lines='skip',
 
109
  # Add column holding collector name and number
110
  mask = (df.recordNumber.notnull())
111
  df.loc[mask,'collectorNameAndNumber']=df[mask].apply(lambda row: '{} {}'.format(row['recordedBy_first_familyname'],row['recordNumber']),axis=1)
112
+ df.to_csv(os.path.join(args.data_dir,args.outputfilename), index=False, sep=',')