# Demo of the KAILAS UAT labeler capabilities
This notebook shows how to use KAILAS to automatically tag text with Unified Astronomy Thesaurus (UAT) concepts.

## Preliminaries
1. load UAT concepts
2. load a dataset

In [1]:
# We need to build a version of the UAT that is better suited to our needs.
# Download the original here: https://github.com/astrothesaurus/UAT/blob/master/UAT.json
# and replace the path below with the location of your own copy.
import json

# Read the file as UTF-8 explicitly, so that decoding does not depend on the
# default encoding of the platform the notebook runs on.
with open('../data/UAT/UAT_list.json', 'r', encoding='utf-8') as f:
    uat_list = json.load(f)

# Build the dict that matches UAT IDs to common names.
# The ID is the last path segment of the concept URI and is kept as a string;
# the name is lower-cased and stripped of surrounding whitespace.
uat_names = {}
for entry in uat_list:
    uat_id = entry['uri'].split('/')[-1]
    uat_names[uat_id] = entry['name'].lower().strip()

# Sort by key. Keys are strings, so the order is lexicographic ('10' sorts before '2').
uat_names = dict(sorted(uat_names.items()))

In [None]:
# Fetch the open SciX UAT keywords dataset through the `datasets` library
from datasets import load_dataset

uat_dataset = load_dataset(path='adsabs/SciX_UAT_keywords')

In [3]:
# Hugging Face datasets can be indexed both by rows (int) and by columns (str).
# Displaying the object lists each split with its features and number of rows.
uat_dataset

DatasetDict({
 val: Dataset({
 features: ['bibcode', 'title', 'abstract', 'verified_uat_ids', 'verified_uat_labels'],
 num_rows: 3025
 })
 train: Dataset({
 features: ['bibcode', 'title', 'abstract', 'verified_uat_ids', 'verified_uat_labels'],
 num_rows: 18677
 })
})

## Main Demo
1. create the prediction pipeline
2. make your predictions
3. format predictions for readability

In [None]:
# create the pipeline

from transformers import pipeline, AutoTokenizer

model_path = 'adsabs/KAILAS'
revision = None  # leave as None for the default revision, or set it to pin a specific one

# Load the tokenizer from the same revision as the model, so that pinning
# `revision` above cannot pair the model weights with a different tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_path,
                                          revision=revision,
                                          model_max_length=512,
                                          do_lower_case=False,
                                          )

# sentiment-analysis means loading ModelForSequenceClassification
# (in transformers it is an alias of the 'text-classification' task)
pipe = pipeline(task='sentiment-analysis',
                model=model_path,
                tokenizer=tokenizer,
                revision=revision,
                num_workers=1,
                batch_size=32,
                # return a score for every label instead of only the best one
                # NOTE(review): newer transformers releases deprecate this flag
                # in favour of `top_k=None` -- confirm against the installed version
                return_all_scores=True,
                # inputs longer than the tokenizer's model_max_length are cut off
                truncation=True,
                )

In [5]:
# custom top_k function
import heapq


def top_k_scores(scores, k):
    """Return the `k` entries of `scores` with the highest 'score' value.

    `scores` is an iterable of dicts that each carry a 'score' key. The result
    is a list ordered from the highest score to the lowest; it holds fewer
    than `k` entries when `scores` itself is shorter.
    """
    def _score_of(entry):
        return entry['score']

    best = heapq.nlargest(k, scores, key=_score_of)
    return best

In [6]:
# MAIN DEMO
# pick some samples from our dataset

num_pred = 3
start = 510

# slicing a split gives the selected rows as columns: one list per feature
temp_dataset = uat_dataset['val'][start:start+num_pred]

# Only rows with a non-empty title are sent to the model. Remember which rows
# those are, so that each prediction is matched back to the right paper even
# when a row is skipped (predictions are positional, one per sentence).
kept_rows = [row for row, title in enumerate(temp_dataset['title']) if title]

# this is a list of strings: title and abstract joined by a single space
sentences = [str(temp_dataset['title'][row]) + ' ' + str(temp_dataset['abstract'][row])
             for row in kept_rows]

# make predictions
all_sentence_scores = pipe(sentences)

# Cast the label ids to strings: the keys of `uat_names` are strings, and it's
# best to think of the outputs as labels anyway, not as integers.
all_sentence_scores = [[{'label': str(s['label']), 'score': s['score']} for s in sample_scores]
                       for sample_scores in all_sentence_scores]

# format for readability, and show top k scores
threshold = 0.15
max_labels = 1000  # upper bound on the number of labels considered per sample

# rank once per sample, then split the ranking into two score bands
ranked_scores = [top_k_scores(s, k=max_labels) for s in all_sentence_scores]

# labels the model assigns: score at or above the threshold
top_sentence_scores = [[{'label': uat_names[item['label']], 'score': item['score']}
                        for item in ranked if item['score'] >= threshold]
                       for ranked in ranked_scores]

# runners-up: strictly below the threshold (so that no label appears in both
# lists) but at least 1% of it
next_sentence_scores = [[{'label': uat_names[item['label']], 'score': item['score']}
                         for item in ranked if 0.01*threshold <= item['score'] < threshold]
                        for ranked in ranked_scores]

# `i` indexes the predictions, `row` indexes the dataset slice; show at most 10
for i, row in enumerate(kept_rows[:10]):
    print('BIBCODE:', temp_dataset['bibcode'][row])
    print('\tTITLE:', temp_dataset['title'][row])
    print('\tABSTRACT:', temp_dataset['abstract'][row])
    print()
    print('AUTHOR ASSIGNED:', temp_dataset['verified_uat_labels'][row])

    if len(top_sentence_scores[i]) > 0:
        print('MODEL ASSIGNED:', [(x['label'], '{:.4f}'.format(x['score'])) for x in top_sentence_scores[i]])
    print()
    # always show the runners-up, even when no label reached the threshold
    print('NEXT SCORES', [(x['label'], '{:.4f}'.format(x['score'])) for x in next_sentence_scores[i]])

    print()
    print()


BIBCODE: 2022ApJ...933..110X
	TITLE: Spatially Resolved Ionized Outflows Extending to 2 kpc in Seyfert 1 Galaxy NGC 7469 Revealed by the Very Large Telescope/MUSE
	ABSTRACT: The Seyfert 1 galaxy NGC 7469 possesses a prominent nuclear starburst ring and a luminous active galactic nucleus (AGN). Evidence of an outflow in the innermost nuclear region has been found in previous works. We detect the ionized gas outflow on a larger scale in the galaxy using the archival Very Large Telescope/MUSE and Chandra observations. The optical emission lines are modeled using two Gaussian components, and a nonparametric approach is applied to measure the kinematics of [O III] and Hα emitting gas. Line ratio diagnostics and spatially resolved maps are derived to examine the origin of the outflow. The kiloparsec-scale kinematics of [O III] are dominated by a blueshifted component whereas the velocity map of Hα shows a rotational disk with a complex nonrotational substructure. The starburst wind around th

In [7]:
# Note: truncation is in effect, so for long inputs only the first 512 tokens are taken into account.
# Build dummy inputs made of 505 up to 514 repetitions of the word '1' and score them.
sentences = [' '.join(['1'] * n_words) for n_words in range(505, 515)]
sentence_scores = pipe(sentences)

In [8]:
# pair the word count of every dummy input with its single best-scoring label
[
    (len(text.split()), top_k_scores(label_scores, k=1))
    for text, label_scores in zip(sentences, sentence_scores)
]

[(505, [{'label': 2189, 'score': 0.8530406355857849}]),
 (506, [{'label': 2189, 'score': 0.8583861589431763}]),
 (507, [{'label': 2189, 'score': 0.8545922040939331}]),
 (508, [{'label': 2189, 'score': 0.8484249114990234}]),
 (509, [{'label': 2189, 'score': 0.8524807095527649}]),
 (510, [{'label': 2189, 'score': 0.8559486269950867}]),
 (511, [{'label': 2189, 'score': 0.8559486865997314}]),
 (512, [{'label': 2189, 'score': 0.8559486269950867}]),
 (513, [{'label': 2189, 'score': 0.8559486269950867}]),
 (514, [{'label': 2189, 'score': 0.8559486865997314}])]

In [9]:
# Demo with manually input astro sentences
sentences = ['This work discusses a junction-less nanowire tunnel field effect transistor (JLN-TFET) that combines the advantages of a junction-less field effect transistor (JLFET) and a tunnel field effect transistor (TFET). With a hetero-structure device made of silicon (Si) and germanium (Ge), an amalgamation of gate engineering and channel engineering is investigated. To eliminate junctions in the structure, a uniformly high dosage of doping (1019cm-3) has been employed throughout. In contrast to the source work function, which is set at 5.93 eV, the gate work function is set at 4.5 eV. When compared to junction less nanowire tunnel FET (JLTFET), the modified gate-all-around hetero junction less nanowire tunnel field effect transistor (GAA-H-JLNTFET) performs better. The proposed structure GAA-H-JLNTFET exhibits the ON current (ION) 6.5 × 10−5µA/m, the off current (IOFF) measures 2.97 × 10−20µA/m, the subthreshold slope (SS) is 12mV/Dec, and ION/IOFFis≈1015which makes them immune to short channel effect and suitable for low power application in Nano regime. Further, in this work, the proposed structure is utilized to implement the dielectric modulated low-power biosensor. The drain current is taken as the sensitivity parameter. Five different biomolecules sensitivity are measured and found better than the previous published results. For the simulation and analysis, the Silvaco Atlas 2D simulator with non-local band-to-band tunneling is used. ',
 'We report observations from the Hubble Space Telescope (HST) of Cepheid variables in the host galaxies of 42 Type Ia supernovae (SNe Ia) used to calibrate the Hubble constant (H 0). These include the complete sample of all suitable SNe Ia discovered in the last four decades at redshift z ≤ 0.01, collected and calibrated from ≥1000 HST orbits, more than doubling the sample whose size limits the precision of the direct determination of H 0. The Cepheids are calibrated geometrically from Gaia EDR3 parallaxes, masers in NGC 4258 (here tripling that sample of Cepheids), and detached eclipsing binaries in the Large Magellanic Cloud. All Cepheids in these anchors and SN Ia hosts were measured with the same instrument (WFC3) and filters (F555W, F814W, F160W) to negate zero-point errors. We present multiple verifications of Cepheid photometry and six tests of background determinations that show Cepheid measurements are accurate in the presence of crowded backgrounds. The SNe Ia in these hosts calibrate the magnitude-redshift relation from the revised Pantheon+ compilation, accounting here for covariance between all SN data and with host properties and SN surveys matched throughout to negate systematics. We decrease the uncertainty in the local determination of H 0 to 1 km s-1 Mpc-1 including systematics. We present results for a comprehensive set of nearly 70 analysis variants to explore the sensitivity of H 0 to selections of anchors, SN surveys, redshift ranges, the treatment of Cepheid dust, metallicity, form of the period-luminosity relation, SN color, peculiar-velocity corrections, sample bifurcations, and simultaneous measurement of the expansion history. Our baseline result from the Cepheid-SN Ia sample is H 0 = 73.04 ± 1.04 km s-1 Mpc-1, which includes systematic uncertainties and lies near the median of all analysis variants. 
We demonstrate consistency with measures from HST of the TRGB between SN Ia hosts and NGC 4258, and include them simultaneously to yield 72.53 ± 0.99 km s-1 Mpc-1. The inclusion of high-redshift SNe Ia yields H 0 = 73.30 ± 1.04 km s-1 Mpc-1 and q 0 = -0.51 ± 0.024. We find a 5σ difference with the prediction of H 0 from Planck cosmic microwave background observations under ΛCDM, with no indication that the discrepancy arises from measurement uncertainties or analysis variations considered to date. The source of this now long-standing discrepancy between direct and cosmological routes to determining H 0 remains unknown.',
 'We use archival COBE/DIRBE data to construct a map of polycyclic aromatic hydrocarbon (PAH) emission in the λ-Orionis region. The presence of the 3.3 μm PAH feature within the DIRBE 3.5 μm band and the corresponding lack of significant PAH spectral features in the adjacent DIRBE bands (1.25, 2.2, and 4.9 μm) enable estimation of the PAH contribution to the 3.5 μm data. Having the shortest wavelength of known PAH features, the 3.3 μm feature probes the smallest PAHs, which are also the leading candidates for carriers of anomalous microwave emission (AME). We use this map to investigate the association between the AME and the emission from PAH molecules. We find that the spatial correlation in λ-Orionis is higher between AME and far-infrared dust emission (as represented by the DIRBE 240 μm map) than it is between our PAH map and AME. This finding, in agreement with previous studies using PAH features at longer wavelengths, is in tension with the hypothesis that AME is due to spinning PAHs. However, the expected correlation between mid-infrared and microwave emission could potentially be degraded by different sensitivities of each emission mechanism to local environmental conditions even if PAHs are the carriers of both.',
 'THis is a noew sentence with typoes and not really about astro anyways',
 ]


# score the manually entered sentences
all_sentence_scores = pipe(sentences)

# again cast the label ids to strings, to future-proof
# (the keys of `uat_names` are strings as well)
all_sentence_scores = [
    [{'label': str(entry['label']), 'score': entry['score']} for entry in per_sentence]
    for per_sentence in all_sentence_scores
]

# keep the 3 best labels of every sentence, replacing each id by its UAT name
top_sentence_scores = [
    [
        {'label': uat_names[entry['label']], 'score': entry['score']}
        for entry in top_k_scores(per_sentence, k=3)
    ]
    for per_sentence in all_sentence_scores
]

In [10]:
# the 3 best-scoring UAT labels, by name, for each manually entered sentence
top_sentence_scores

[[{'label': 'astronomical instrumentation', 'score': 0.9473740458488464},
 {'label': 'astronomy data modeling', 'score': 0.49145984649658203},
 {'label': 'space vehicle instruments', 'score': 0.39475584030151367}],
 [{'label': 'hubble constant', 'score': 0.993058443069458},
 {'label': 'cosmology', 'score': 0.006782000884413719},
 {'label': 'planetary nebulae', 'score': 0.001997443614527583}],
 [{'label': 'interstellar dust', 'score': 0.9986469149589539},
 {'label': 'polycyclic aromatic hydrocarbons', 'score': 0.99810791015625},
 {'label': 'interstellar medium', 'score': 0.9940189123153687}],
 [{'label': 'time series analysis', 'score': 0.2862895429134369},
 {'label': 'astronomy data analysis', 'score': 0.08515171706676483},
 {'label': 'optical telescopes', 'score': 0.04666740819811821}]]

In [11]:
# the score for every label is available, not only the top ones:
# these are the first 10 label scores of the first sentence
all_sentence_scores[0][0:10]

[{'label': '0', 'score': 4.918351805827115e-07},
 {'label': '2', 'score': 2.4442993407092217e-08},
 {'label': '3', 'score': 1.2135334372942452e-06},
 {'label': '4', 'score': 7.317441941268044e-08},
 {'label': '5', 'score': 1.078589502867544e-05},
 {'label': '6', 'score': 2.913926877567974e-08},
 {'label': '7', 'score': 2.268499343927033e-07},
 {'label': '8', 'score': 2.4015573529823087e-08},
 {'label': '9', 'score': 2.046675717792823e-08},
 {'label': '10', 'score': 1.2331685006472526e-08}]

In [12]:
# the last 10 label scores of the first sentence
all_sentence_scores[0][-10:]

[{'label': '2363', 'score': 7.282670910768729e-09},
 {'label': '2364', 'score': 3.004765858349856e-07},
 {'label': '2365', 'score': 1.247675868398801e-06},
 {'label': '2366', 'score': 5.6775856904778266e-08},
 {'label': '2367', 'score': 3.278066174061678e-08},
 {'label': '2368', 'score': 1.39621681682911e-06},
 {'label': '2369', 'score': 0.00028976111207157373},
 {'label': '2370', 'score': 1.9995159163954668e-06},
 {'label': '2371', 'score': 2.577130089775892e-06},
 {'label': '2372', 'score': 3.240722179498334e-08}]