galaxy_search/utils.py at master · georgestein/galaxy_search · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
import streamlit as st
import numpy as np
import base64
import sys
import os
from urllib import request

def radec_string_to_degrees(ra_str, dec_str, ra_unit_formats, dec_unit_formats):
    """convert from weird astronomer units to useful ones (degrees)"""

    ra_err_str = "The RA entered is not in the proper form: {:s}".format(ra_unit_formats)
    dec_err_str = "The Dec entered is not in the proper form: {:s}".format(dec_unit_formats)

    if ':' in ra_str:
        try:
            HH, MM, SS = [float(i) for i in ra_str.split(':')]
        except ValueError:
            st.write(ra_err_str)
            sys.exit(ra_err_str)

        ra_str = 360./24 * (HH + MM/60 + SS/3600)

    if ':' in dec_str:
        try:
            DD, MM, SS = [float(i) for i in dec_str.split(':')]

        except ValueError:
            st.write(dec_err_str)
            sys.exit(dec_err_str)
        dec_str = DD/abs(DD) * (abs(DD) + MM/60 + SS/3600)

    try:
        ra = float(ra_str)
    except ValueError:
        st.write(ra_err_str)
        sys.exit(ra_err_str)

    try:
        dec = float(dec_str)
    except ValueError:
        st.write(dec_err_str)
        sys.exit(dec_err_str)

    return ra, dec

def angular_separation(ra1, dec1, ra2, dec2):
    """
    Angular separation between two points on a sphere.

    Parameters
    ----------
    ra1, dec1, ra2, dec2, : ra and dec in degrees

    Returns
    -------
    angular separation in degrees

    Notes
    -----
    see https://en.wikipedia.org/wiki/Great-circle_distance
    Adapted from Astropy https://github.com/astropy/astropy/blob/main/astropy/coordinates/angle_utilities.py. I am avoiding Astropy as it can be very slow
    """

    ra1 = np.radians(ra1)
    ra2 = np.radians(ra2)
    dec1 = np.radians(dec1)
    dec2 = np.radians(dec2)

    dsin_ra = np.sin(ra2 - ra1)
    dcos_ra = np.cos(ra2 - ra1)
    sin_dec1 = np.sin(dec1)
    sin_dec2 = np.sin(dec2)
    cos_dec1 = np.cos(dec1)
    cos_dec2 = np.cos(dec2)

    num1 = cos_dec2 * dsin_ra
    num2 = cos_dec1 * sin_dec2 - sin_dec1 * cos_dec2 * dcos_ra
    denominator = sin_dec1 * sin_dec2 + cos_dec1 * cos_dec2 * dcos_ra

    return np.degrees(np.arctan2(np.hypot(num1, num2), denominator))


def calculate_similarity_faiss(rep, query_ind, metric='IP', nnearest=10):
    #import faiss

    """
    Uses Facebook's Faiss library. Has the option to reduce dimensionality
    but right now it is only using simply matrix operations (dot product)
    Parameters
    ----------
    rep: representations
    Metric:
        IP: Inner Product
        L2: L2norm

    Returns
    -------
    indices of similar, similarity measure
    """
    if not isinstance(metric, str):
        sys.exit('Metric {0} must be a string'.format(metric))


    dim = rep.shape[-1] # assuming 2D array (N_rep, N_dim)

    if metric=='IP':
        index = faiss.IndexFlatIP(dim) #  distance
    elif metric=='L2':
        index = faiss.IndexFlatL2(dim) #  distance
    else:
        sys.exit('Metric {0} does not exist'.format(metric))

    # add all representations to index
    index.add(rep)

    # search for nearest instances, and return distance and indices
    dist, similar_inds = index.search(rep[query_ind][None, ...], nnearest)

    return similar_inds[0], dist[0]


def calculate_similarity(rep, query_ind, nnearest=10, similarity_inv=False):
    """
    Calculate cosine similarity of query feature vector with all image representations
    Vectors are already normalized, so cosine similarity becomes dot product

    Parameters
    ----------
    rep: array
        Image representations

    Returns
    -------
    indices of similar, similarity measure
    """

    # Search for nearest instances, and return distance and indices
    dist = rep @ rep[query_ind]

    if similarity_inv:
        # Smallest values first
        similar_inds = np.argsort(dist)
    else:
        # Largest values first
        similar_inds = np.argsort(dist)[::-1]

    dist = dist[similar_inds][:nnearest]
    similar_inds = similar_inds[:nnearest]

    return similar_inds, dist

def retrieve_similarity(query_ind, model_version='v1'):
    """
    Retreives precalculated similarity indices and distance values.

    Indices and values are saved in binary files of size (sim_chunksize, nnearest),
    of dtype=np.int32 and np.float32, respectively
    """
    sim_chunksize = 10000
    nnearest = 1000
    bytes_per_dtype = 4

    if model_version=='v1':
        model_string = '8hour_south'
    if model_version=='v2':
        model_string = '8hour_south_torgb'

    url_head = 'https://portal.nersc.gov/project/cusp/ssl_galaxy_surveys/galaxy_search/data/similarity_arrays/{:s}/small_chunks/'.format(model_string)

    ichunk = query_ind // sim_chunksize

    istart = ichunk*sim_chunksize
    iend   = (ichunk+1)*sim_chunksize
    ngal_tot = 42272646
    iend = min(iend, ngal_tot)
    url_dist = os.path.join(url_head, 'dist_knearest1000_{:09d}_{:09d}.bin'.format(istart, iend))
    url_inds = os.path.join(url_head, 'inds_knearest1000_{:09d}_{:09d}.bin'.format(istart, iend))

    query_line = query_ind % sim_chunksize

    skip_bytes = query_line*nnearest*bytes_per_dtype
    with request.urlopen(request.Request(url_dist, headers={'Range': 'bytes={:d}-'.format(skip_bytes)})) as f:
         dist = np.frombuffer(f.read(nnearest*bytes_per_dtype), dtype=np.float32)

    with request.urlopen(request.Request(url_inds, headers={'Range': 'bytes={:d}-'.format(skip_bytes)})) as f:
         similar_inds = np.frombuffer(f.read(nnearest*bytes_per_dtype), dtype=np.int32)

    return similar_inds, dist

def urls_from_coordinates(catalogue, pixscale=0.262, npix=256):
    """
    gets url for image cutout from https://www.legacysurvey.org/

    e.g. https://www.legacysurvey.org/viewer/jpeg-cutout?ra=190.1086&dec=1.2005&width=96&layer=dr-9-south&pixscale=0.262&bands=grz
    """

    urls = []
    url_head = 'https://www.legacysurvey.org/viewer/jpeg-cutout?'
    for i in range(catalogue['ra'].shape[0]):
        urls += [url_head + 'ra={:.4f}&dec={:.4f}&size={:d}&layer=ls-dr9-south&pixscale={:.3f}&bands=grz'.format(catalogue['ra'][i], catalogue['dec'][i], npix, pixscale)]

    return urls

def get_table_download_link(df):
    """
    Generates a link allowing the data in a given panda dataframe to be downloaded

    Parameters
    ----------
    df: Pandas dataframe

    Returns
    -------
    href string

    Notes
    -----
    From https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806/2

    """
    csv = df.to_csv(index=True)
    b64 = base64.b64encode(csv.encode()).decode()  # some strings <-> bytes conversions necessary here
    href = f'<a href="data:file/csv;base64,{b64}" download="most_similar_galaxies.csv">Download table as csv file</a>'

    return href