Nicky Nicolson committed on
Commit
2cc6a74
·
1 Parent(s): 4d53d1c

Modified name parsing to use bionomia directly

Browse files
Files changed (2) hide show
  1. Dockerfile +7 -7
  2. tab2csv.py +38 -4
Dockerfile CHANGED
@@ -18,15 +18,15 @@ RUN unzip /data/gbif-occs.zip -d /data
18
  RUN ls -l /data
19
  COPY ./tab2csv.py /code/tab2csv.py
20
 
21
- # Setup to parse collector names using Bionomia utils (reqs Ruby)
22
- # Install ruby
23
- RUN \
24
- apt-get update && \
25
- apt-get install -y ruby
26
- RUN gem install dwc_agent
27
 
28
  #COPY ./extractcollectorname.py /code/extractcollectorname.py
29
- RUN python tab2csv.py --limit 1000 --createcols /data/${GBIF_DOWNLOAD_ID}.csv /data/gbifocc.csv
30
  #RUN python extractcollectorname.py /data/gbifocc-temp.csv /data/gbifocc.csv
31
  RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
32
  RUN ls -l /code
 
18
  RUN ls -l /data
19
  COPY ./tab2csv.py /code/tab2csv.py
20
 
21
+ ## Setup to parse collector names using Bionomia utils (reqs Ruby)
22
+ ## Install ruby
23
+ #RUN \
24
+ # apt-get update && \
25
+ # apt-get install -y ruby
26
+ #RUN gem install dwc_agent
27
 
28
  #COPY ./extractcollectorname.py /code/extractcollectorname.py
29
+ RUN python tab2csv.py --createcols /data/${GBIF_DOWNLOAD_ID}.csv /data/gbifocc.csv
30
  #RUN python extractcollectorname.py /data/gbifocc-temp.csv /data/gbifocc.csv
31
  RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
32
  RUN ls -l /code
tab2csv.py CHANGED
@@ -1,18 +1,51 @@
1
  import argparse
2
  import pandas as pd
3
- import bananompy
4
  from tqdm import tqdm
5
  tqdm.pandas()
6
 
7
- def getFirstFamilyName(s):
8
  firstFamilyName = None
9
- parsed = bananompy.parse(s)
10
  try:
11
  firstFamilyName = parsed[0]['parsed'][0]['family']
12
  except:
13
  pass
14
  return firstFamilyName
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  if __name__ == '__main__':
17
  parser = argparse.ArgumentParser()
18
  parser.add_argument("inputfile")
@@ -30,7 +63,8 @@ if __name__ == '__main__':
30
  if args.createcols:
31
  # Extract unique recordedBy values
32
  df_rb = df[['recordedBy']].drop_duplicates()
33
- df_rb['recordedBy_first_familyname'] = df_rb.recordedBy.progress_apply(getFirstFamilyName)
 
34
  # Apply back to main dataframe
35
  df = pd.merge(left = df, right=df_rb, left_on='recordedBy', right_on='recordedBy', how='left')
36
  # Add column holding collector name and number
 
1
  import argparse
2
  import pandas as pd
3
+ import requests
4
  from tqdm import tqdm
5
  tqdm.pandas()
6
 
7
+ def getFirstFamilyName(recordedBy):
8
  firstFamilyName = None
9
+ parsed = bananompy.parse(recordedBy)
10
  try:
11
  firstFamilyName = parsed[0]['parsed'][0]['family']
12
  except:
13
  pass
14
  return firstFamilyName
15
 
16
+ def getFirstFamilyNames(recordedBy_l):
17
+ # post to bionomia
18
+ bionomia_parse_endpoint_url = "https://api.bionomia.net/parse.json"
19
+ data = dict()
20
+ data['names'] = '\r\n'.join(recordedBy_l)
21
+ r = requests.post(bionomia_parse_endpoint_url, data=data)
22
+ parsed_results = r.json()
23
+ results = dict()
24
+ for parsed_result in parsed_results:
25
+ try:
26
+ results[parsed_result['original']] = parsed_result['parsed'][0]['family']
27
+ except:
28
+ results[parsed_result['original']] = None
29
+ return results
30
+
31
+ def getFirstFamilyNameBulk(df,
32
+ recordedByColName="recordedBy",
33
+ firstFamilyNameColName="recordedBy_first_familyname",
34
+ batchsize=500):
35
+ results = dict()
36
+ recordedBy_l = []
37
+ for s in tqdm(df[recordedByColName].values):
38
+ if len(recordedBy_l) == batchsize:
39
+ # send it
40
+ results.update(getFirstFamilyNames(recordedBy_l))
41
+ # clear for next iteration
42
+ recordedBy_l = []
43
+ recordedBy_l.append(s)
44
+ if len(recordedBy_l) > 0:
45
+ results.update(getFirstFamilyNames(recordedBy_l))
46
+ df[firstFamilyNameColName] = df[recordedByColName].map(results)
47
+ return df
48
+
49
  if __name__ == '__main__':
50
  parser = argparse.ArgumentParser()
51
  parser.add_argument("inputfile")
 
63
  if args.createcols:
64
  # Extract unique recordedBy values
65
  df_rb = df[['recordedBy']].drop_duplicates()
66
+ df_rb = getFirstFamilyNameBulk(df_rb)
67
+ #df_rb['recordedBy_first_familyname'] = df_rb.recordedBy.progress_apply(getFirstFamilyName)
68
  # Apply back to main dataframe
69
  df = pd.merge(left = df, right=df_rb, left_on='recordedBy', right_on='recordedBy', how='left')
70
  # Add column holding collector name and number