# Copyright 2021 United States Government as represented by the Administrator of the National Aeronautics and Space
# Administration. No copyright is claimed in the United States under Title 17, U.S. Code. All Other Rights Reserved.
r"""
This module defines the interface to the UCAC4 star catalogue.
Catalogue Description
=====================
The UCAC4 is a catalogue of 113 million stars with 105 million including proper motion solutions. It is complete to
about a magnitude of 16. It is generally very accurate for positions, but less so for magnitudes.
The UCAC 4 uses a file based database system that is fast for small queries, but can be slow for queries over a large
area. It also does not include blended stars (stars that are close enough that they appear as a single star in an
image). If you need faster retrieval and/or blended stars then you should use the :mod:`.giant_catalogue` instead,
which is built primarily using the UCAC4 catalogue.
Use
===
The UCAC4 catalogue can be used anywhere that a star catalogue is required in GIANT.
It is stored in a number of index and binary data files on disk and takes about 9 GB of space. If you attempt to
initialize the class and point it to a directory that does not contain the UCAC4 data it will ask you if you want to
download the catalogue (note that the UCAC4 data is not included by default so if you have not downloaded it yourself
you will definitely need to). If you answer yes, be aware that it may take a very long time to download.
Once you have initialized the class (and downloaded the data files), then you can access the catalogue as you would any
GIANT usable catalogue. Simply call :meth:`~.UCAC4.query_catalogue` to get the GIANT records for the stars as a
dataframe with columns according the :attr:`.GIANT_COLUMNS`. This class also provides 2 helper methods,
:meth:`~.UCAC4.query_catalogue_raw` which can be used to retrieve the raw catalogue entries (instead of the GIANT
entries) and :meth:`~.UCAC4.cross_ref_tycho` which can be used to get the raw Tycho 2 catalogue records for a UCAC4
star (if available).
"""
import os
import warnings
import copy
import time
from pathlib import Path
from datetime import datetime
from enum import Enum
from operator import lt, gt
from sqlite3 import Connection
from typing import Optional, List, Dict, TextIO, Iterable, Union, Tuple, Callable, BinaryIO, Any
import numpy as np
import pandas as pd
from giant.catalogues.meta_catalogue import GIANT_COLUMNS, GIANT_TYPES
from giant.catalogues.meta_catalogue import Catalogue
from giant.catalogues.utilities import (DEG2MAS, MAS2RAD, PARSEC2KM, STAR_DIST, DEG2RAD,
radec_distance, apply_proper_motion)
from giant.catalogues.tycho import Tycho2
from giant._typing import PATH, Real, ARRAY_LIKE
# specify which ucac cols correspond to which GIANT_COLUMNS
# noinspection SpellCheckingInspection
_UCAC_COLS: List[str] = ['ra', 'spd', 'parallax', 'pmra', 'pmdc', 'apasm_v',
'sigra', 'sigdc', 'sigpara', 'sigpmr', 'sigpmd']
"""
This specifies the names of the UCAC columns that are required in converting the a GIANT star record
"""
# specify the mapping of UCAC columns to GIANT columns
_UCAC_TO_GIANT: Dict[str, str] = dict(zip(_UCAC_COLS, GIANT_COLUMNS))
"""
This specifies the mapping of the UCAC column names to the GIANT star record column names
"""
# specify the default UCAC4 directory location
UCAC_DIR: Path = Path(__file__).resolve().parent / "data" / "UCAC4"
"""
This specifies the default location of the UCAC4 directory, which is in the data directory contained in the same
directory containing this source file.
"""
[docs]class UCAC4(Catalogue):
"""
This class provides access to the UCAC4 star catalogue.
This class is a fully functional catalogue for GIANT and can be used anywhere that GIANT expects a star catalogue.
As such, it implements the :attr:`include_proper_motion` to turn proper motion on or off as well as the method
:meth:`query_catalogue` which is how stars are queried into the GIANT format. In addition, this catalogue provides
2 additional methods :meth:`cross_ref_tycho` and :meth:`query_catalogue_raw` to get the corresponding Tycho 2 record
for the input stars or the raw UCAC4 records. These methods aren't used anywhere by GIANT itself, but may be useful
if you are doing some advanced analysis.
To use this class simply initialize it, pointing to the directory where the u4b and u4i catalogue directories are
contained. If the catalogue files do not exist it will ask you if you want to download them, and if you answer yes,
it will download the UCAC4 catalogue (which takes a long time in most instances). Once the class is initialized,
you can query stars from it using :meth:`query_catalogue` which will return a dataframe of the star records with
:attr:`.GIANT_COLUMNS` columns.
"""
def __init__(self, directory: PATH = UCAC_DIR, include_proper_motion: bool = True):
"""
:param directory: The directory containing the UCAC4 data. This should contain 2 sub directories u4i and u4b.
:param include_proper_motion: A boolean flag specifying whether to apply proper motion when retrieving the stars
"""
super().__init__(include_proper_motion=include_proper_motion)
directory = Path(directory)
if not directory.exists():
print("UCAC data not found at {}".format(directory), flush=True)
user_response = input("Would you like to download the UCAC data to this directory (y/n)?\n"
" WARNING: THIS REQUIRES AN INTERNET CONNECTION, WILL TAKE A LONG TIME, AND WILL"
" USE UP 9 GB OF SPACE!\n ")
if user_response[:1].lower() == 'y':
# make sure the directory exits
directory.mkdir(exist_ok=True, parents=True)
download_ucac(directory)
else:
raise FileNotFoundError('The UCAC data is not available in the specified directory. Cannot initialize'
'The UCAC4 class.')
self.root_directory: Path = directory
"""
The root directory containing the u4i and u4b catalogue file
"""
self.bytes_per_rec = 78
"""
The number of bytes per record in the binary files according to the UCAC4 documentation
"""
self._index_file: Path = self.root_directory / 'u4i' / 'u4index.asc'
"""
The index file to use to index into the binary files
"""
if not self._index_file.is_file():
print("UCAC index file not found at {}".format(directory), flush=True)
user_response = input("Would you like to download the UCAC data to this directory (y/n)?\n"
" WARNING: THIS REQUIRES AN INTERNET CONNECTION, WILL TAKE A LONG TIME, AND WILL"
" USE UP 9 GB OF SPACE!\n ")
if user_response[:1].lower() == 'y':
download_ucac(directory)
else:
raise FileNotFoundError("We can't find the index file for the database. This is a required file\n"
"that should be in the u4i directory in the UCAC directory. The file should\n"
"be called u4index.asc. Please locate this file and place it in the proper \n"
"directory. Please also verify the rest of your UCAC directory.")
self._zone_files: List[Path] = [(self.root_directory / 'u4b' / 'z{0:0>3d}'.format(zone))
for zone in np.arange(1, 901)]
"""
The list of zone file objects that allow reading the data
"""
for file in self._zone_files:
if not file.exists():
print("UCAC zone files not found at {}".format(directory), flush=True)
user_response = input("Would you like to download the UCAC data to this directory (y/n)?\n"
"\tWARNING: THIS REQUIRES AN INTERNET CONNECTION, WILL TAKE A LONG TIME, AND WILL"
" USE UP 9 GB OF SPACE!\n ")
if user_response[:1].lower() == 'y':
download_ucac(directory)
else:
raise FileNotFoundError("We can't find the zone files for the database. These are required files"
"that should be in the u4b directory in the UCAC directory. The files "
"should be called Z### from 001 to 900. Please locate these files and place"
" them in the proper directory. Please also verify the rest of your UCAC "
"directory.")
break
try:
# noinspection SpellCheckingInspection
self._supplement_file: TextIO = (self.root_directory / 'u4i' / 'u4supl.dat').open('r')
"""
The supplement file object
"""
except FileNotFoundError:
print("UCAC supplement file not found at {}".format(directory), flush=True)
user_response = input("Would you like to download the UCAC data to this directory (y/n)?\n"
" WARNING: THIS REQUIRES AN INTERNET CONNECTION, WILL TAKE A LONG TIME, AND WILL"
" USE UP 9 GB OF SPACE!\n ")
if user_response[:1].lower() == 'y':
download_ucac(directory)
# noinspection SpellCheckingInspection
self._supplement_file = (self.root_directory / 'u4i' / 'u4supl.dat').open('r')
else:
# noinspection SpellCheckingInspection
raise FileNotFoundError("We can't find the supplement file for the database. This is a required file\n"
"that should be in the u4i directory in the UCAC4 directory. The file should\n"
"be called u4supl.dat. Please locate this file and place it in \n"
"the proper directory. Please also verify the rest of your directory directory."
)
try:
self._hpm_file: BinaryIO = (self.root_directory / 'u4i' / 'u4hpm.dat').open('rb')
"""
The high proper motion supplement file.
This is a text file which contains the proper motions for high proper motion stars
"""
except FileNotFoundError:
print("UCAC high proper motion file not found at {}".format(directory), flush=True)
user_response = input("Would you like to download the UCAC data to this directory (y/n)?\n"
" WARNING: THIS REQUIRES AN INTERNET CONNECTION, WILL TAKE A LONG TIME, AND WILL"
" USE UP 9 GB OF SPACE!\n ")
if user_response[:1].lower() == 'y':
download_ucac(directory)
self._hpm_file = (self.root_directory / 'u4i' / 'u4hpm.dat').open('rb')
else:
raise FileNotFoundError("We can't find the high proper motion file. This is \n"
"a required file that should be in the u4i directory in the UCAC4 directory.\n"
"The file should be called u4hpm.dat. Please locate this file and place it in\n"
"the proper directory. Please also verify the rest of your directory directory."
)
try:
# noinspection SpellCheckingInspection
self._hippo_file: BinaryIO = (self.root_directory / 'u4i' / 'hipsupl.dat').open('rb')
"""
The Hipparcos cross reference supplement file.
This contains bright stars included in the hipparcos catalogue, including parallax
"""
except FileNotFoundError:
print("UCAC hipparcos supplement file not found at {}".format(directory), flush=True)
user_response = input("Would you like to download the UCAC data to this directory (y/n)?\n"
" WARNING: THIS REQUIRES AN INTERNET CONNECTION, WILL TAKE A LONG TIME, AND WILL"
" USE UP 9 GB OF SPACE!\n ")
if user_response[:1].lower() == 'y':
download_ucac(directory)
# noinspection SpellCheckingInspection
self._hippo_file = (self.root_directory / 'u4i' / 'hipsupl.dat').open('rb')
else:
# noinspection SpellCheckingInspection
raise FileNotFoundError("We can't find the supplement file containing the hipparcos data. This is \n"
"a required file that should be in the u4i directory in the UCAC4 directory.\n"
"The file should be hipsupl.dat. Please locate this file and place it in\n"
"the proper directory. Please also verify the rest of your directory directory."
)
try:
self._tycho_cross_file: BinaryIO = (self.root_directory / 'u4i' / 'u4xtycho').open('rb')
"""
This is the Tycho 2 cross reference file, which maps UCAC4 ids to Tycho 2 ids.
"""
except FileNotFoundError:
print("UCAC tycho supplement file not found at {}".format(directory), flush=True)
user_response = input("Would you like to download the UCAC data to this directory (y/n)?\n"
" WARNING: THIS REQUIRES AN INTERNET CONNECTION, WILL TAKE A LONG TIME, AND WILL"
" USE UP 9 GB OF SPACE!\n ")
if user_response[:1].lower() == 'y':
download_ucac(directory)
self._tycho_cross_file = (self.root_directory / 'u4i' / 'u4xtycho').open('rb')
else:
warnings.warn('Unable to find the Tycho2 2 cross reference file. '
'This is required to get a true complete \n'
'catalogue. Proceeding without the tycho catalogue '
'but be aware that your results will be \n'
'incomplete for bright stars. To use the Tycho2 catalogue please locate the u4xtycho file'
'and place it in the u4i directory.')
# noinspection SpellCheckingInspection
self.cat_names: List[str] = ['ra', 'spd', 'magm', 'maga', 'sigmag', 'objt', 'cdf', # 15 bytes
'sigra', 'sigdc', 'na1', 'nu1', 'cu1', # 5 bytes
'cepra', 'cepdc', 'pmra', 'pmdc', 'sigpmr', 'sigpmd', # 10 bytes
'pts_key', 'j_m', 'h_m', 'k_m', 'icqflg_j', 'icqflg_h', 'icqflg_k', 'e2mpho_j',
'e2mpho_h', 'e2mpho_k', # 16 bytes
'apasm_b', 'apasm_v', 'apasm_g', 'apasm_r', 'apasm_i', 'apase_b', 'apase_v',
'apase_g', 'apase_r', 'apase_i', 'gcflg', # 16 bytes
'icf', # 4 bytes
'leda', 'x2m', 'rnm', 'zn2', 'rn2'] # 12 bytes
"""
This is a list of the names of the catalogue star record columns according to the UCAC4 documentation.
"""
self.cat_formats: List[str] = ['i4', 'i4', 'i2', 'i2', 'i1', 'i1', 'i1', # 15 bytes
'i1', 'i1', 'i1', 'i1', 'i1', # 5 bytes
'i2', 'i2', 'i2', 'i2', 'i1', 'i1', # 10 bytes
'i4', 'i2', 'i2', 'i2', 'i1', 'i1', 'i1', 'i1', 'i1', 'i1', # 16 bytes
'i2', 'i2', 'i2', 'i2', 'i2', 'i1', 'i1', 'i1', 'i1', 'i1', 'i1', # 16 bytes
'i4', # 4 bytes
'i1', 'i1', 'i4', 'i2', 'i4'] # 12 bytes
"""
This is a list of the types for the raw UCAC4 star record columns according to the UCAC4 documentation
The types are numpy dtype strings
"""
self.cat_dtype: np.dtype = np.dtype({'names': self.cat_names, 'formats': self.cat_formats})
"""
This is the numpy datatype that represents a record in the catalogue binary files.
This is used when accessing the binary files as memory maps.
"""
self.hpm_formats = copy.copy(self.cat_formats)
"""
This is a list of the format strings for the high proper stars.
It is the mostly same as the regular table since the stars are included in the main catalogue but need data
from an extra table.
"""
# these are the 2 columns that need a slightly different format
new_hpm_format_inds = [14, 15]
for ind in new_hpm_format_inds:
self.hpm_formats[ind] = 'i8'
self.hpm_dtype: np.dtype = np.dtype({'names': self.cat_names, 'formats': self.hpm_formats})
"""
This specifies the numpy datatype used to represent the high proper motion stars.
"""
self.index: Optional[np.ndarray] = None
"""
This contains the index into the catalogue that is used to efficiently retrieve stars.
The index is stored as a numpy structured array. It gives the number of stars for each block in each zone file.
"""
# make the index
self.build_index()
[docs] def query_catalogue(self, ids: Optional[ARRAY_LIKE] = None, min_ra: Real = 0, max_ra: Real = 360,
min_dec: Real = -90, max_dec: Real = 90, min_mag: Real = -4, max_mag: Real = 20,
search_center: Optional[ARRAY_LIKE] = None, search_radius: Optional[Real] = None,
new_epoch: Optional[Union[datetime, Real]] = None) -> pd.DataFrame:
"""
This method queries stars from the catalogue that meet specified constraints and returns them as a DataFrame
with columns of :attr:`.GIANT_COLUMNS`.
Stars can either be queried by ID directly or by right ascension/declination/magnitude. You cannot filter using
both with this method. If :attr:`apply_proper_motion` is ``True`` then this will shift the stars to the new
epoch input by the user (``new_epoch``) using proper motion.
:param ids: A sequence of star ids to retrieve from the catalogue. The ids are given by zone, rnz and should be
input as an iterable that yields tuples (therefore if you have a dataframe you should do
``df.itertuples(false)``
:param min_ra: The minimum ra bound to query stars from in degrees
:param max_ra: The maximum ra bound to query stars from in degrees
:param min_dec: The minimum declination to query stars from in degrees
:param max_dec: The maximum declination to query stars from in degrees
:param min_mag: The minimum magnitude to query stars from. Recall that magnitude is inverse (so lower
magnitude is a dimmer star)
:param max_mag: The maximum magnitude to query stars from. Recall that magnitude is inverse (so higher
magnitude is a dimmer star)
:param search_center: The center of a search cone as a ra/dec pair.
:param search_radius: The radius about the center of the search cone
:param new_epoch: The epoch to translate the stars to using proper motion if :attr:`apply_proper_motion` is
turned on
:return: A Pandas dataframe with columns :attr:`GIANT_COLUMNS`.
"""
cat_recs = self.query_catalogue_raw(ids=ids, min_ra=min_ra, max_ra=max_ra, min_dec=min_dec, max_dec=max_dec,
min_visual_mag=min_mag, max_visual_mag=max_mag,
search_center=search_center, search_radius=search_radius,
generator=False)
giant_record = self.convert_to_giant_catalogue(cat_recs)
giant_record = giant_record[(giant_record.mag <= max_mag) & (giant_record.mag >= min_mag)]
if self.include_proper_motion and (new_epoch is not None):
apply_proper_motion(giant_record, new_epoch, copy=False)
return giant_record
[docs] def query_catalogue_raw(self, ids: Optional[ARRAY_LIKE] = None, min_ra: Real = 0., max_ra: Real = 360.,
min_dec: Real = -90., max_dec: Real = 90.,
search_center: Optional[ARRAY_LIKE] = None, search_radius: Optional[Real] = None,
max_visual_mag: Real = 20., min_visual_mag: Real = -1.44,
generator: bool = False) -> Union[pd.DataFrame, Iterable[pd.DataFrame]]:
"""
This method queries stars from the catalogue that meet specified constraints and returns them as a DataFrame
or as an iterable of dataframes where the columns are the raw catalogue files.
Stars can either be queried by ID directly or by right ascension/declination/magnitude. You cannot filter using
both with this method. This method is not usable by GIANT and it does not apply proper motion. If you need
records that are usable by GIANT and with proper motion applied see :meth:`query_catalogue`. For details on what
the columns are refer to the UCAC4 documentation (can be found online).
:param ids: A sequence of star ids to retrieve from the catalogue. The ids are given by zone, rnz and should be
input as an iterable that yields tuples (therefore if you have a dataframe you should do
``df.itertuples(false)``
:param min_ra: The minimum ra bound to query stars from in degrees
:param max_ra: The maximum ra bound to query stars from in degrees
:param min_dec: The minimum declination to query stars from in degrees
:param max_dec: The maximum declination to query stars from in degrees
:param min_visual_mag: The minimum visual magnitude to query stars from. Recall that magnitude is inverse (so
lower magnitude is a dimmer star)
:param max_visual_mag: The maximum visual magnitude to query stars from. Recall that magnitude is inverse (so
higher magnitude is a dimmer star)
:param search_center: The center of a search cone as a ra/dec pair.
:param search_radius: The radius about the center of the search cone
:param generator: A boolean flag specifying whether to return the stars as a generator (preserving memory) or to
to collect all of the stars into a single dataframe. If ``True`` then the function will yield
dataframes (which may themselves contain single or multiple stars).
:return: A (Iterable of) Pandas dataframe(s) with columns according to the catalogue columns.
"""
if ids is not None:
out = self.get_from_ids(ids)
else:
out = self._get_all_with_criteria(min_ra=min_ra, max_ra=max_ra, min_dec=min_dec, max_dec=max_dec,
search_center=search_center, search_radius=search_radius,
max_visual_mag=max_visual_mag, min_visual_mag=min_visual_mag)
if generator is False:
results = []
for res in out:
results.append(res)
if results:
return pd.concat(results)
else:
# noinspection SpellCheckingInspection
return pd.DataFrame(columns=self.cat_names + ['zone', 'rnz', 'parallax', 'sigpara'])
else:
return out
[docs] def get_from_ids(self, ids: ARRAY_LIKE) -> Iterable[pd.DataFrame]:
"""
This is a generator which returns single records for each requested ID.
``ids`` should be iterable with each element being length 2 and returning zone, rnz.
The yields will be the raw catalogue data.
:param ids: A sequence of star ids to retrieve from the catalogue. The ids are given by zone, rnz and should be
input as an iterable that yields tuples (therefore if you have a dataframe you should do
``df.itertuples(false)``
:return: An Iterable of Pandas dataframes with columns according to the catalogue columns.
"""
for zone, rnz in ids:
offset = (rnz - 1) * self.bytes_per_rec
records = pd.DataFrame.from_records(np.fromfile(self._zone_files[zone - 1], dtype=self.cat_dtype,
offset=offset, count=1).astype(self.hpm_dtype),
index='rnm')
# append new columns
records = records.assign(zone=zone, rnz=rnz, parallax=0., sigpara=0.)
yield records
def _get_all_with_criteria(self, min_ra: Real = 0., max_ra: Real = 360., min_dec: Real = -90., max_dec: Real = 90.,
search_center: Optional[ARRAY_LIKE] = None, search_radius: Optional[Real] = None,
max_visual_mag: Real = 20., min_visual_mag: Real = -1.44) -> Iterable[pd.DataFrame]:
"""
This function gets all stars meeting the criteria from the catalogue, yielding the results as DataFrames by
zone.
In general, the user should not interact with this method and instead should use :meth:`query_catalogue_raw`.
:param min_ra: The minimum ra bound to query stars from in degrees
:param max_ra: The maximum ra bound to query stars from in degrees
:param min_dec: The minimum declination to query stars from in degrees
:param max_dec: The maximum declination to query stars from in degrees
:param min_visual_mag: The minimum visual magnitude to query stars from. Recall that magnitude is inverse (so
lower magnitude is a dimmer star)
:param max_visual_mag: The maximum visual magnitude to query stars from. Recall that magnitude is inverse (so
higher magnitude is a dimmer star)
:param search_center: The center of a search cone as a ra/dec pair.
:param search_radius: The radius about the center of the search cone
:return: An Iterable of Pandas dataframes with columns according to the catalogue columns.
"""
if search_center is not None:
# determine which zone files we need to check
# find the bounds based on the specified min ra/dec and the search center stuff
min_ra = float(max(search_center[0] - search_radius, min_ra))
max_ra = float(min(search_center[0] + search_radius, max_ra))
min_dec = float(max(search_center[1] - search_radius, min_dec))
max_dec = float(min(search_center[1] + search_radius, max_dec))
if min_dec < -90.0:
# if the search region includes the south pole take all RA
min_dec = -90
min_ra = 0
max_ra = 360
elif max_dec > 90.0:
# if the search region includes the north pole take all RA
max_dec = 90
min_ra = 0
max_ra = 360
if min_ra < 0:
# if the first point of ares is included then we need to query both the high and low RA
min_ra += 360
# check the upper bounds
zone_start_a, block_start_a = self.get_zone_block(min_ra, min_dec)
# check the lower bounds
zone_start_b, block_start_b = self.get_zone_block(0, min_dec)
# determine the limiting case
zone_start = min(zone_start_a, zone_start_b)
block_start = min(block_start_a, block_start_b)
# check the upper bounds
zone_end_a, block_end_a = self.get_zone_block(360, max_dec)
# check the lower bounds
zone_end_b, block_end_b = self.get_zone_block(max_ra, max_dec)
# determine the limiting case
zone_end = max(zone_end_a, zone_end_b)
block_end = max(block_end_a, block_end_b)
elif max_ra > 360:
# if the first point of ares is included then we need to query both the high and low RA
max_ra -= 360
# check the upper bounds
zone_start_a, block_start_a = self.get_zone_block(min_ra, min_dec)
# check the lower bounds
zone_start_b, block_start_b = self.get_zone_block(0, min_dec)
# determine the limiting case
zone_start = min(zone_start_a, zone_start_b)
block_start = min(block_start_a, block_start_b)
# check the upper bounds
zone_end_a, block_end_a = self.get_zone_block(360, max_dec)
# check the lower bounds
zone_end_b, block_end_b = self.get_zone_block(max_ra, max_dec)
# determine the limiting case
zone_end = max(zone_end_a, zone_end_b)
block_end = max(block_end_a, block_end_b)
else:
# Nothing special needs to happen because we are in the ra domain
zone_start, block_start = self.get_zone_block(min_ra, min_dec)
zone_end, block_end = self.get_zone_block(max_ra, max_dec)
else:
# if we don't have to deal with the search radius stuff then just use what the user gave us
zone_start, block_start = self.get_zone_block(float(min_ra), float(min_dec))
zone_end, block_end = self.get_zone_block(float(max_ra), float(max_dec))
# walk through each zone that we need to search
for zone in np.arange(zone_start, zone_end + 1):
# Get the beginning and ending record number in the current file
start, _ = self.index[self.get_index_ind(zone, block_start)]
end_start, end_num = self.index[self.get_index_ind(zone, block_end)]
number_of_stars = end_start + end_num - start
# Get the starting byte for the first record we need from this file and seek to it
start_byte = self.bytes_per_rec * start
# read in the correct rows. First access as a memmap to conserve memory while we further limit the
# results
# noinspection PyTypeChecker
data = np.memmap(self._zone_files[zone - 1], dtype=self.cat_dtype, mode='r',
offset=start_byte, shape=(number_of_stars,))
# begin to further limit the results
if search_center is not None:
bearing_check = radec_distance(data['ra'] * MAS2RAD, data['spd'] * MAS2RAD - np.pi / 2,
search_center[0] * DEG2RAD,
search_center[1] * DEG2RAD) <= (search_radius * DEG2RAD)
else:
bearing_check = (data['ra'] >= min_ra * DEG2MAS) & (data['ra'] <= max_ra * DEG2MAS) & \
(data['spd'] >= (min_dec + 90) * DEG2MAS) & (data['spd'] <= (max_dec + 90) * DEG2MAS)
# limit the magnitude based on the APASS V magnitude if available.
# If it is not then default to the ucac magnitude but that is crappy..
mag_check = (
(data['apasm_v'] <= max_visual_mag * 1000) &
(data['apasm_v'] >= min_visual_mag * 1000)
) | (
(data['magm'] <= max_visual_mag * 1000) &
(data['magm'] >= min_visual_mag * 1000)
)
valid_data = bearing_check & mag_check
if not valid_data.any():
continue
records = pd.DataFrame.from_records(np.array(data[valid_data]).astype(self.hpm_dtype),
index='rnm')
# append new columns
records = records.assign(zone=zone, rnz=np.where(valid_data)[0] + 1 + start,
parallax=0., sigpara=0.)
# check to see if hpm stars were encountered
# noinspection SpellCheckingInspection
hpm_check = (records["pmra"] == 32767)
if hpm_check.any():
hpm_records = records[hpm_check].copy()
for rnm in hpm_records.index:
# should probably just store this in memory instead of doing a binary
hpm_line = binary_search(self._hpm_file, rnm).decode().split()
# noinspection SpellCheckingInspection
hpm_records.loc[rnm, 'pmra'] = int(hpm_line[3])
# noinspection SpellCheckingInspection
hpm_records.loc[rnm, 'pmdc'] = int(hpm_line[4])
records[hpm_check] = hpm_records
# check to see if any hipparcos stars are included
hip_flag = records['icf'].values // (10 ** 8)
hip_check = (hip_flag == 7) | (hip_flag == 9)
if hip_check.any():
running_numbers = records[hip_check].index
# get the information about each hipparcos star and add it to the data
for rnm in running_numbers:
hip_rec = binary_search(self._hippo_file, rnm).decode().split()
records.loc[rnm, 'parallax'] = float(hip_rec[8])
# noinspection SpellCheckingInspection
records.loc[rnm, 'sigpara'] = float(hip_rec[13])
yield records
[docs] def build_index(self):
"""
This method builds the in memory index into the catalogue files.
The index is stored in the :attr:`index` attribute as a numpy array. It specifies the number of stars in each
block in each zone to make it easy to seek to the right place.
"""
# noinspection SpellCheckingInspection
self.index = np.genfromtxt(self._index_file, dtype=[('start', np.uint32), ('nstars', np.uint16)],
usecols=[0, 1])
[docs] @staticmethod
def get_zone_block(ra: Real, dec: Real) -> Tuple[int, int]:
"""
This tells you the zone and block number that correspond to a specific ra dec pair
:param ra: the right ascension under consideration in units of degrees
:param dec: The declination under consideration in units of degrees
:return: a tuple containing the zone and block location for the ra/dec
"""
# zones are based on dec
zone = int(np.floor((dec + 90) / 0.2) + 1)
# blocks are based on ra
ra_ind = int(np.floor(ra / 0.25) + 1)
# need this check because ra is < 360, not <=
if ra_ind > 1440:
ra_ind = 1440
# need this check for when the max dec is 90, because this is included in the last file
if zone > 900:
zone = 900
return zone, ra_ind
[docs] @staticmethod
def get_index_ind(zone: int, ra_ind: int) -> int:
"""
This determines the location in the index that corresponds to the requested zone and block
:param zone: The zone number
:param ra_ind: The right ascension index
:return:
"""
return (zone - 1) * 1440 + ra_ind - 1
[docs] @staticmethod
def convert_to_giant_catalogue(ucac_records: pd.DataFrame) -> pd.DataFrame:
"""
This method converts records in the catalogue format into records in the GIANT format.
This is done by renaming columns and converting units.
:param ucac_records: The raw records from the catalogue as a pandas DataFrame
:return: The GIANT records as a Pandas DataFrame
"""
# prep the ucac data frame (set the full index)
ucac_records = ucac_records.assign(source='UCAC4').reset_index().rename(columns={'index': 'rnm'})
ucac_records = ucac_records.set_index(['source', 'zone', 'rnz', 'rnm'])
records = ucac_records.loc[:, _UCAC_COLS]
records.rename(columns=_UCAC_TO_GIANT, inplace=True)
records = records.assign(epoch=2000.0)
records.dtypes.loc[GIANT_COLUMNS] = GIANT_TYPES
# replace invalid magnitudes with the ucac model magnitude
invalid_mag = records['mag'] == 20000
records.loc[invalid_mag, 'mag'] = ucac_records.loc[invalid_mag, 'magm'].astype(np.float64)
# convert to giant units
records['ra'] /= DEG2MAS # MAS to DEG
records['dec'] /= DEG2MAS # MAS to DEG
records['dec'] -= 90. # SPD to DEC
records['distance_sigma'] /= records['distance'] ** 2 # convert parallax std to distance std
records['distance'] /= 1000 # MAS to arcsecond
records['distance'] **= -1 # parallax to distance (arcsecond to parsec)
records['distance'] *= PARSEC2KM # parsec to kilometers
records['mag'] /= 1000. # mMAG to MAG
records['ra_sigma'] += 128 # to uint
records['ra_sigma'] /= DEG2MAS # to deg
records['dec_sigma'] += 128 # to uint
records['dec_sigma'] /= DEG2MAS # to deg
records['ra_proper_motion'] /= 10 * DEG2MAS # 0.1 MAS/YR to DEG/YR
records['dec_proper_motion'] /= 10 * DEG2MAS # 0.1 MAS/YR to DEG/YR
records['ra_pm_sigma'] /= 10 * DEG2MAS # 0.1 MAS/YR to DEG/YR
records['dec_pm_sigma'] /= 10 * DEG2MAS # 0.1 MAS/YR to DEG/YR
records['distance_sigma'] *= 1000 # 1/MAS to parsec
records['distance_sigma'] *= PARSEC2KM # parsec to km
# convert the sigmas to J2000
# noinspection SpellCheckingInspection
ra_shift_time = 2000 - (ucac_records['cepra'] / 100 + 1900)
# noinspection SpellCheckingInspection
dec_shift_time = 2000 - (ucac_records['cepdc'] / 100 + 1900)
records['ra_sigma'] = np.sqrt(records['ra_sigma'] ** 2 + ra_shift_time ** 2 * records['ra_pm_sigma'] ** 2)
records['dec_sigma'] = np.sqrt(records['dec_sigma'] ** 2 + dec_shift_time ** 2 * records['dec_pm_sigma'] ** 2)
# fix for stars with no parallax -- The distance standard deviation seems wrong for these
default_distance_error = 20 / (STAR_DIST / PARSEC2KM / 1000) ** 2 * PARSEC2KM * 1000
records['distance_sigma'].fillna(value=default_distance_error, inplace=True)
records['distance'].replace([np.inf, -np.inf], STAR_DIST, inplace=True)
# fix for stars where the parallax is invalid
records.loc[records.distance < 0, 'distance'] = STAR_DIST
records.loc[records.distance < 0, 'distance_sigma'] = default_distance_error
# specify that the epoch of the stars is J2000
return records
[docs] def cross_ref_tycho(self, ucac_labels: ARRAY_LIKE, tycho_cat: Optional[Tycho2] = None) -> pd.DataFrame:
"""
This retrieves the Tycho 2 catalogue records for the requested UCAC4 star Zone, RNZ values.
:param tycho_cat: The tycho catalogue instance to use. If None, a default instance will be created
:param ucac_labels: The UCAC 4 labels as an iterable of Zone, RNZ pairs
:return: The raw Tycho 2 star records.
"""
if tycho_cat is None:
tycho_cat = Tycho2()
out = []
# loop through all of the UCAC stars
for label in ucac_labels:
# compose the UCAC id as included in the u4xtycho file
query = label[0] * 1e6 + label[1]
# search the u4xtycho file to see if this corresponds to a tycho star
line = binary_search(self._tycho_cross_file, query, column=1)
if line is not None:
line = line.decode()
# determine the tycho2 id from the u4xtycho file
t2id = ' '.join([line[:4], line[5:10], line[11]])
# get the tycho2 star record
rec = tycho_cat.retrieve_record(t2id)
out.append(rec)
else:
warnings.warn('The requested UCAC4 star does not have a tycho reference associated with it: {}'.format(
label
))
# todo: may need to figure out how to get a unique id here so it doesn't get overwritten
out.append(tycho_cat.nan_frame())
return pd.concat(out)
[docs] def dump_to_sqlite(self, database_connection: Connection, limiting_mag: Real = 20, use_tycho_mag: bool = False,
return_locations: bool = False, return_mag: Optional[Real] = None) -> Optional[pd.DataFrame]:
"""
Use this to write the catalogue to a sqlite3 database in the GIANT format.
You can control what stars/data are included using the key word argument inputs. You can also have this return
the star magnitude/locations for doing blending stars.
In general you should not use this directly. Instead you should use :func:`~.giant_catalogue.build_catalogue`
or script :mod:`~.scripts.build_catalogue`.
:param database_connection: The connection to the database the data is to be dumped to
:param limiting_mag: The maximum magnitude to include in the catalogue. This is based off of the APASM_V
magnitude or the UCAC4 magm magnitude, depending on which is available
:param use_tycho_mag: This flag stores the magnitude from the Tycho2 catalogue for each star that is in the
Tycho2 catalogue. Note that this will be very slow.
:param return_locations: This flag specifies to return locations for stars for doing blending
:param return_mag: This flag specifies to only return locations for stars that are brighter than this magnitude.
If ``None`` then all stars are returned.
:return: A dataframe of the dumped stars that meet the ``return_mag`` condition or ``None`` if
``return_locations`` is ``False``
"""
from .tycho import Tycho2
# if we want to use the tycho information in place of UCAC4 then create the tycho catalogue interface
if use_tycho_mag:
tycho = Tycho2()
else:
tycho = None
# list for returning the results if we are doing that
out = []
start = time.time()
print('dumping zone {}'.format(1), flush=True)
# loop through each zone file and dump it
for ind, records in enumerate(self.query_catalogue_raw(max_visual_mag=limiting_mag, generator=True)):
# convert into the GIANT format
giant_records = self.convert_to_giant_catalogue(records)
# if we are cross referencing the tycho catalogue do it
if use_tycho_mag:
tycho_recs = self.cross_ref_tycho(giant_records.index.droplevel(['source', 'rnm']).tolist(),
tycho_cat=tycho)
# add a column so we can see where the tycho information came from
giant_records.loc[:, 'tycho id'] = ''
for star_index, magnitude_index in enumerate(giant_records.index):
if not np.isnan(tycho_recs.iloc[star_index].VTmag):
# update the magnitude if we found a tycho record
giant_records.loc[magnitude_index, 'tycho id'] = '{}-{}-{}'.format(
*tycho_recs.iloc[star_index].name
)
giant_records.loc[magnitude_index, 'mag'] = tycho_recs.iloc[star_index].VTmag
# set the index to be the rnm
giant_records = giant_records.reset_index().set_index('rnm')
# dump it out to the GIANT catalogue in the stars table
giant_records.to_sql('stars', database_connection, if_exists='append')
print('zone dumped in {:.3f} secs'.format(time.time() - start), flush=True)
if return_locations:
if return_mag is not None:
out.append(giant_records.loc[giant_records.mag <= return_mag, ["ra", "dec", "mag"]])
else:
out.append(giant_records.loc[:, ["ra", "dec", "mag"]])
zone = ind + 2
start = time.time()
print('dumping zone {}'.format(zone), flush=True)
if return_locations:
return pd.concat(out)
[docs]class ColumnOrder(Enum):
"""
This enumeration specifies whether a column is sorted in ascending or descending order.
This is intended to be used as an input to :func:`binary_search`.
"""
ASCENDING = "ASCENDING"
"""
The column is sorted in ascending order (smallest first)
"""
DESCENDING = "DESCENDING"
"""
The column is sorted in DESCENDING order (smallest last)
"""
[docs]def binary_search(file: BinaryIO, label: Any, column: int = 0,
separator: Optional[str] = None, column_conversion: Callable = float,
order: Union[ColumnOrder, str] = ColumnOrder.ASCENDING,
start: int = 0, stop: Optional[int] = None, line_length: Optional[int] = None) -> Optional[bytes]:
"""
This helper function does a binary search on a sorted file with fixed width lines.
The binary search is performed by successively checking the midpoint between the current block of the file under
consideration and using it to determine whether to search to the left or right of the midpoint for the next
iteration. As such, this requires the lines to be sorted on the column that is being searched. This also requires
that the column being searched is orderable (implements comparison operators) after conversion from a string.
The conversion into an orderable type is controlled using the ``column_conversion`` keyword argument. This is
applied to the specified column (controlled by keyword arguments ``column`` and ``separator``) to create an
orderable object. This can be any callable, so long as it returns an orderable object, but typically is a python
type like ``int`` or ``float``. Note that strings are orderable as well, therefore you can make
``column_conversion`` ``str``, however be aware that the ordering of strings can be confusing when white space is
involved (for instance ``'10'`` is less than ``'2'`` according to string comparisons). Therefore, unless your
numbers are 0 padded (ie ``'02'``), we recommend using a numeric type for the ``column_conversion``.
If the searched for label is found in the column then the line in which it is found is returned (as a bytes object).
If it is not found then ``None`` is returned.
:param file: The file object to search. This should be opened in binary read mode so that we can seek
:param label: the label we are searching for in the file object. This must support equality comparison (==) with
the type that is returned by ``column_conversion``.
:param column: the column index that is to be searched
:param separator: The separator spec for splitting the file. If ``None`` then defaults to white space. This is
passed directly to ``str.split``
:param column_conversion: The callable to convert the column into an orderable object. Typically this should be one
of the python builtin types (like ``float`` or ``int``) but it can be ay callable so long
as the return supports less than/greater than operators. This is applied as
``column_conversion(line.split(sep=separator))`` where ``line`` is the current line under
consideration.
:param order: How the column being searched is sorted. This should be either ``ASCENDING`` or ``DESCENDING`` (one
of the :class:`ColumnOrder` enum values)
:param start: Where to start in the file in bytes. Typically this is unused unless you know you can skip part of
the file
:param stop: Where to stop the search in bytes. If this is ``None`` then it will be set to the length of the file.
Typically this is unused unless you know you can skip part of the file
:param line_length: The number of bytes in each line. If ``None`` then this will be computed from the file.
:return:
"""
# if the line length for the file wasn't specified determine it
if line_length is None:
file.seek(0, os.SEEK_SET)
file.readline()
line_length = file.tell()
# if the end of our search isn't specified, use the end of the file
if stop is None:
file.seek(0, os.SEEK_END)
stop = file.tell()
if isinstance(order, str):
order = ColumnOrder(order.upper())
# determine which comparison operator to use based on the sort order
if order is ColumnOrder.ASCENDING:
left_comparison = lt
else:
left_comparison = gt
while True:
# determine the number of lines in the file
stop_line = stop // line_length
# determine the line we are starting on
start_line = start // line_length
# determine the line we are searching next
mid_line = (stop_line - start_line) // 2 + start_line
mid = mid_line * line_length
# seek to the line to search
file.seek(mid, os.SEEK_SET)
line = file.readline()
# reached the end of the file, return None to indicate failure
if not line:
return None
# get the label from this line
line_lab = column_conversion(line.split(sep=separator)[column])
if line_lab == label:
# if this is the line we want return it
return line
elif mid_line == start_line or mid_line == stop_line:
# if we have run out of lines to search stop
return None
elif left_comparison(line_lab, label):
# if the search line is below the label we want then search from it to the stop
start = mid
else:
# if the search line is above the label we want then search from the start to the search line
stop = mid
[docs]def check_file(file: Path, md5s: Optional[dict] = None) -> Tuple[bool, Optional[dict]]:
"""
This checks the md5sum of a file to ensure it wasn't corrupted.
.. warning::
This function can only verify the integrity of the files through md5 sums which are insecure. While the vizier
service is trusted, use this function at your own risk
:param file: The file to check the md5 of
:param md5s: A dictionary containing the md5 sums for each file
:return: a flag specifying if the file was correct and the dictionary of md5 sums used to check the file
"""
import hashlib
if md5s is None:
md5_file = file.parent / "md5sum.txt" # type: Path
if md5_file.exists():
md5s = {}
with md5_file.open('r') as m_file_obj:
for line in m_file_obj:
if line.startswith('#'):
continue
else:
fmd5, m_file_obj = line.split()
md5s[m_file_obj.replace('*', '')] = fmd5
else:
return False, None
if file.exists():
base = file.name
if base != "md5sum.txt":
# MD5 has a security risk but its better than nothing...
return md5s.get(base, None) == hashlib.md5(file.open('rb').read()).hexdigest(), md5s # nosec
else:
return True, md5s
else:
return False, md5s
[docs]def download_ucac(target_directory: Path):
"""
This function downloads the UCAC4 catalogue from vizier to the target directory.
This is done over ftp. It requires an active internet connection that can connect to cdsarc.u-strasbg.fr
.. warning::
This download will take a long time and use up approximately 9 GB of space.
.. warning::
This download can only verify the integrity of the files through md5 sums which are insecure. While the vizier
service is trusted, use this function at your own risk
:param target_directory: the directory to save the UCAC catalogue to
"""
# we minimize the security risk here by using FTPS
import ftplib # nosec
# FTPS is secure
ftp = ftplib.FTP_TLS('cdsarc.u-strasbg.fr') # nosec
# anonymous login since we're just grabbing data
ftp.connect()
ftp.sendcmd('USER anonymous')
ftp.sendcmd('PASS anonymous@a.com')
ftp.cwd('pub/cats/I/322A/UCAC4/')
lines = []
ftp.retrlines('LIST', callback=lines.append)
for line in lines:
if line.startswith('d'):
# directory
name = line.split()[-1]
if name in ["u4b", "u4i"]:
local_directory = target_directory / name
local_directory.mkdir(parents=True, exist_ok=True)
download_lines = []
ftp.retrlines('LIST {}'.format(name), callback=download_lines.append)
md5s = None
for dl in download_lines:
start = time.time()
file = dl.split()[-1]
local = local_directory / file
downloaded, md5s = check_file(local, md5s)
if not downloaded:
with local.open('wb') as download_file:
# noinspection SpellCheckingInspection
ftp.retrbinary('RETR {}/{}'.format(name, file), download_file.write)
if name == 'u4b':
print('{} of z900 done in {:.3f}'.format(file, time.time()-start), flush=True)
else:
print('{} done in {:.3f}'.format(file, time.time()-start), flush=True)
else:
print('{} already downloaded'.format(file))