Nicky Nicolson committed on
Commit
7f36417
·
1 Parent(s): 561ae23

Modifications to allow use of DWCA format download

Browse files
Files changed (2) hide show
  1. Dockerfile +1 -1
  2. tab2csv.py +35 -2
Dockerfile CHANGED
@@ -18,7 +18,7 @@ RUN ls -lh /data
18
  COPY ./tab2csv.py /code/tab2csv.py
19
 
20
 
21
- RUN python tab2csv.py --createcols /data/${GBIF_DOWNLOAD_ID}.csv /data/gbifocc.csv
22
  RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
23
  RUN ls -l /code
24
  RUN sqlite-utils tables /code/gbifocc.db --counts
 
18
  COPY ./tab2csv.py /code/tab2csv.py
19
 
20
 
21
+ RUN python tab2csv.py --createcols ${GBIF_DOWNLOAD_ID} /data/gbifocc.csv
22
  RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
23
  RUN ls -l /code
24
  RUN sqlite-utils tables /code/gbifocc.db --counts
tab2csv.py CHANGED
@@ -1,8 +1,10 @@
1
  import argparse
2
  import pandas as pd
3
  import requests
 
4
  from tqdm import tqdm
5
  tqdm.pandas()
 
6
 
7
  def getFirstFamilyName(recordedBy):
8
  firstFamilyName = None
@@ -46,19 +48,50 @@ def getFirstFamilyNameBulk(df,
46
  df[firstFamilyNameColName] = df[recordedByColName].map(results)
47
  return df
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  if __name__ == '__main__':
50
  parser = argparse.ArgumentParser()
51
- parser.add_argument("inputfile")
52
  parser.add_argument("-c","--createcols", action='store_true')
53
  parser.add_argument("-l","--limit", type=int)
54
  parser.add_argument("outputfile")
55
  args = parser.parse_args()
56
 
57
- df = pd.read_csv(args.inputfile,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  encoding='utf8',
59
  keep_default_na=False,
60
  on_bad_lines='skip',
61
  sep='\t',
 
62
  nrows=args.limit)
63
  if args.createcols:
64
  # Extract unique recordedBy values
 
1
  import argparse
2
  import pandas as pd
3
  import requests
4
+ from pygbif import occurrences as occ
5
  from tqdm import tqdm
6
  tqdm.pandas()
7
+ import os.path
8
 
9
  def getFirstFamilyName(recordedBy):
10
  firstFamilyName = None
 
48
  df[firstFamilyNameColName] = df[recordedByColName].map(results)
49
  return df
50
 
51
# GBIF "describe" endpoints returning the column metadata for each
# occurrence-download format.
GBIF_DOWNLOAD_DESCRIBE_URL_SIMPLE_CSV = 'https://api.gbif.org/v1/occurrence/download/describe/simpleCsv'
GBIF_DOWNLOAD_DESCRIBE_URL_DWCA = 'https://api.gbif.org/v1/occurrence/download/describe/dwca'


def getGbifDownloadColumnNames(download_format):
    """Return the column names provided by a GBIF occurrence download format.

    Args:
        download_format: either 'SIMPLE_CSV' or 'DWCA'.

    Returns:
        A list of column-name strings, or None if the format is not one of
        the two recognised values.

    Raises:
        requests.HTTPError: if the GBIF describe endpoint responds with an
            HTTP error status.
    """
    column_names = None
    if download_format == 'SIMPLE_CSV':
        r = requests.get(GBIF_DOWNLOAD_DESCRIBE_URL_SIMPLE_CSV)
        # Fail fast on an HTTP error rather than on an opaque JSON decode error
        r.raise_for_status()
        columns_metadata = r.json()
        column_names = [column_metadata['name'] for column_metadata in columns_metadata['fields']]
    elif download_format == 'DWCA':
        r = requests.get(GBIF_DOWNLOAD_DESCRIBE_URL_DWCA)
        r.raise_for_status()
        columns_metadata = r.json()
        # DWCA downloads list their columns under the verbatim extension's fields
        column_names = [column_metadata['name'] for column_metadata in columns_metadata['verbatim']['fields']]
    return column_names
65
+
66
+
67
  if __name__ == '__main__':
68
  parser = argparse.ArgumentParser()
69
+ parser.add_argument("download_id")
70
  parser.add_argument("-c","--createcols", action='store_true')
71
  parser.add_argument("-l","--limit", type=int)
72
  parser.add_argument("outputfile")
73
  args = parser.parse_args()
74
 
75
+ # Determine format of datafile by accessing download metadata from GBIF API
76
+ gbif_metadata = occ.download_meta(key = args.download_id)
77
+ download_format = gbif_metadata['request']['format']
78
+ inputfile = None
79
+ column_names_simple_csv = getGbifDownloadColumnNames('SIMPLE_CSV')
80
+ column_names = None
81
+ if download_format == 'SIMPLE_CSV':
82
+ inputfile = '{}.csv'.format(args.download_id)
83
+ column_names = column_names_simple_csv
84
+ elif download_format == 'DWCA':
85
+ inputfile = 'occurrence.txt'
86
+ column_names_dwca = getGbifDownloadColumnNames('DWCA')
87
+ column_names = [column_name for column_name in column_names_dwca if column_name in column_names_simple_csv]
88
+
89
+ df = pd.read_csv(os.path.join('data',inputfile),
90
  encoding='utf8',
91
  keep_default_na=False,
92
  on_bad_lines='skip',
93
  sep='\t',
94
+ usecols=column_names,
95
  nrows=args.limit)
96
  if args.createcols:
97
  # Extract unique recordedBy values