Nicky Nicolson
committed on
Commit
·
948faf7
1
Parent(s):
7a26d32
Add step to extract collector name & create col for reconciliation
Browse files- .gitignore +2 -0
- Dockerfile +2 -1
- extractcollectorname.py +36 -0
- metadata.json +1 -1
- requirements.txt +2 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
env
|
2 |
+
data
|
Dockerfile
CHANGED
@@ -17,7 +17,8 @@ RUN ls -l /data
|
|
17 |
RUN unzip /data/gbif-occs.zip -d /data
|
18 |
RUN ls -l /data
|
19 |
COPY ./tab2csv.py /code/tab2csv.py
|
20 |
-
RUN python tab2csv.py /data/${GBIF_DOWNLOAD_ID}.csv /data/gbifocc.csv
|
|
|
21 |
RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
|
22 |
RUN ls -l /code
|
23 |
RUN sqlite-utils tables /code/gbifocc.db --counts
|
|
|
17 |
RUN unzip /data/gbif-occs.zip -d /data
|
18 |
RUN ls -l /data
|
19 |
COPY ./tab2csv.py /code/tab2csv.py
|
20 |
+
RUN python tab2csv.py /data/${GBIF_DOWNLOAD_ID}.csv /data/gbifocc-temp.csv
|
21 |
+
# The script has to be copied into the image before it can be run (same pattern as the tab2csv.py step).
COPY ./extractcollectorname.py /code/extractcollectorname.py
RUN python extractcollectorname.py /data/gbifocc-temp.csv /data/gbifocc.csv
|
22 |
RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
|
23 |
RUN ls -l /code
|
24 |
RUN sqlite-utils tables /code/gbifocc.db --counts
|
extractcollectorname.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import pandas as pd
|
3 |
+
import bananompy
|
4 |
+
from tqdm import tqdm
|
5 |
+
tqdm.pandas()
|
6 |
+
|
7 |
+
def getFirstFamilyName(s):
    """Return the family name of the first person parsed from *s*.

    Args:
        s: A collector string (a ``recordedBy`` value). Missing values
            (``None``/NaN) and blank strings are accepted.

    Returns:
        The ``'family'`` entry of the first parsed name, or ``None`` when the
        input is missing/blank or the parse result has no such entry.
    """
    # Missing cells arrive as NaN (float) or None; there is nothing to parse,
    # so skip the parser call entirely.
    if not isinstance(s, str) or not s.strip():
        return None

    # NOTE(review): the result is presumably a list of per-input results, each
    # holding a 'parsed' list of name dicts - inferred from the indexing below.
    parsed = bananompy.parse(s)
    try:
        return parsed[0]['parsed'][0]['family']
    except (IndexError, KeyError, TypeError):
        # No parsed name / no family part in the result: treat as "not found"
        # without hiding unrelated errors (or KeyboardInterrupt).
        return None
|
15 |
+
|
16 |
+
if __name__ == '__main__':
    # Usage: extractcollectorname.py INPUTFILE OUTPUTFILE
    # Reads a comma-separated file, adds the columns
    # 'recordedBy_first_familyname' and 'collectorNameAndNumber', and writes
    # the result to OUTPUTFILE.
    parser = argparse.ArgumentParser()
    parser.add_argument("inputfile")
    parser.add_argument("outputfile")
    args = parser.parse_args()

    df = pd.read_csv(args.inputfile,
                     encoding='utf8',
                     keep_default_na=False,
                     na_values=['NONE', ''],
                     on_bad_lines='skip',
                     sep=',')

    # Parse each distinct recordedBy value only once, then map the result
    # back onto every row via a left merge.
    df_rb = df[['recordedBy']].drop_duplicates()
    df_rb['recordedBy_first_familyname'] = df_rb.recordedBy.progress_apply(getFirstFamilyName)
    df = pd.merge(left=df, right=df_rb, left_on='recordedBy', right_on='recordedBy', how='left')

    # Add column holding collector name and number.
    # Only build it where BOTH parts are present; otherwise the value would be
    # a misleading string such as "None 123" or "nan 123".
    mask = df.recordNumber.notnull() & df.recordedBy_first_familyname.notnull()
    name_and_number = df.recordedBy_first_familyname.astype(str) + ' ' + df.recordNumber.astype(str)
    # Series.where keeps values where mask is True and leaves NaN elsewhere,
    # so the column is always created, even when no row qualifies.
    df['collectorNameAndNumber'] = name_and_number.where(mask)

    df.to_csv(args.outputfile, index=False, sep=',')
|
metadata.json
CHANGED
@@ -7,7 +7,7 @@
|
|
7 |
"plugins": {
|
8 |
"datasette-reconcile": {
|
9 |
"id_field": "gbifID",
|
10 |
-
"name_field": "
|
11 |
"type_field": "basisOfRecord",
|
12 |
"type_default": [{
|
13 |
"id": "basisOfRecord",
|
|
|
7 |
"plugins": {
|
8 |
"datasette-reconcile": {
|
9 |
"id_field": "gbifID",
|
10 |
+
"name_field": "collectorNameAndNumber",
|
11 |
"type_field": "basisOfRecord",
|
12 |
"type_default": [{
|
13 |
"id": "basisOfRecord",
|
requirements.txt
CHANGED
@@ -3,3 +3,5 @@ datasette-reconcile
|
|
3 |
sqlite-utils
|
4 |
csvs-to-sqlite
|
5 |
pandas==1.5.3
|
|
|
|
|
|
3 |
sqlite-utils
|
4 |
csvs-to-sqlite
|
5 |
pandas==1.5.3
|
6 |
+
bananompy
|
7 |
+
tqdm
|