Source code for giant.catalogues.ucac

# Copyright 2021 United States Government as represented by the Administrator of the National Aeronautics and Space
# Administration.  No copyright is claimed in the United States under Title 17, U.S. Code. All Other Rights Reserved.


r"""
This module defines the interface to the UCAC4 star catalogue.

Catalogue Description
=====================

The UCAC4 is a catalogue of 113 million stars, 105 million of which include proper motion solutions.  It is complete
to about magnitude 16.  It is generally very accurate for positions, but less so for magnitudes.

The UCAC4 uses a file-based database system that is fast for small queries but can be slow for queries over a large
area.  It also does not include blended stars (stars that are close enough together that they appear as a single star
in an image).  If you need faster retrieval and/or blended stars then you should use the :mod:`.giant_catalogue`
instead, which is built primarily using the UCAC4 catalogue.

Use
===

The UCAC4 catalogue can be used anywhere that a star catalogue is required in GIANT.
It is stored in a number of index and binary data files on disk and takes about 9 GB of space.  If you attempt to
initialize the class and point it to a directory that does not contain the UCAC4 data, it will ask you if you want to
download the catalogue (note that the UCAC4 data is not included by default, so if you have not downloaded it yourself
you will need to).  If you answer yes, be aware that the download may take a very long time.

Once you have initialized the class (and downloaded the data files), you can access the catalogue as you would any
GIANT usable catalogue.  Simply call :meth:`~.UCAC4.query_catalogue` to get the GIANT records for the stars as a
dataframe with columns according to :attr:`.GIANT_COLUMNS`.  This class also provides two helper methods:
:meth:`~.UCAC4.query_catalogue_raw`, which can be used to retrieve the raw catalogue entries (instead of the GIANT
entries), and :meth:`~.UCAC4.cross_ref_tycho`, which can be used to get the raw Tycho 2 catalogue records for a UCAC4
star (if available).
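
For example, the following illustrative session (which assumes the catalogue files have already been downloaded to
the default directory) queries the GIANT records for stars brighter than magnitude 12 in a small region of the sky:

    >>> from giant.catalogues.ucac import UCAC4
    >>> cat = UCAC4()
    >>> stars = cat.query_catalogue(min_ra=10, max_ra=20, min_dec=-5, max_dec=5, max_mag=12.0)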
"""

import os
import warnings
import copy
import time
from pathlib import Path
from datetime import datetime

from enum import Enum

from operator import lt, gt

from sqlite3 import Connection


from typing import Optional, List, Dict, TextIO, Iterable, Union, Tuple, Callable, BinaryIO, Any

import numpy as np

import pandas as pd

from giant.catalogues.meta_catalogue import GIANT_COLUMNS, GIANT_TYPES
from giant.catalogues.meta_catalogue import Catalogue
from giant.catalogues.utilities import (DEG2MAS, MAS2RAD, PARSEC2KM, STAR_DIST, DEG2RAD,
                                        radec_distance, apply_proper_motion)
from giant.catalogues.tycho import Tycho2

from giant._typing import PATH, Real, ARRAY_LIKE


# specify which ucac cols correspond to which GIANT_COLUMNS
# noinspection SpellCheckingInspection
_UCAC_COLS: List[str] = ['ra', 'spd', 'parallax', 'pmra', 'pmdc', 'apasm_v',
                         'sigra', 'sigdc', 'sigpara', 'sigpmr', 'sigpmd']
"""
This specifies the names of the UCAC columns that are required when converting to a GIANT star record
"""

# specify the mapping of UCAC columns to GIANT columns
_UCAC_TO_GIANT: Dict[str, str] = dict(zip(_UCAC_COLS, GIANT_COLUMNS))
"""
This specifies the mapping of the UCAC column names to the GIANT star record column names
"""

# specify the default UCAC4 directory location
UCAC_DIR: Path = Path(__file__).resolve().parent / "data" / "UCAC4"
"""
This specifies the default location of the UCAC4 directory, which is in the data directory contained in the same
directory as this source file.
"""


class UCAC4(Catalogue):
    """
    This class provides access to the UCAC4 star catalogue.

    This class is a fully functional catalogue for GIANT and can be used anywhere that GIANT expects a star
    catalogue.  As such, it implements the :attr:`include_proper_motion` attribute to turn proper motion on or off,
    as well as the method :meth:`query_catalogue`, which is how stars are queried into the GIANT format.

    In addition, this catalogue provides two extra methods, :meth:`cross_ref_tycho` and :meth:`query_catalogue_raw`,
    to get the corresponding Tycho 2 record for the input stars or the raw UCAC4 records.  These methods aren't used
    anywhere by GIANT itself, but may be useful if you are doing some advanced analysis.

    To use this class simply initialize it, pointing to the directory where the u4b and u4i catalogue directories are
    contained.  If the catalogue files do not exist it will ask you if you want to download them, and if you answer
    yes, it will download the UCAC4 catalogue (which takes a long time in most instances).  Once the class is
    initialized, you can query stars from it using :meth:`query_catalogue`, which will return a dataframe of the star
    records with :attr:`.GIANT_COLUMNS` columns.
    """

    def __init__(self, directory: PATH = UCAC_DIR, include_proper_motion: bool = True):
        """
        :param directory: The directory containing the UCAC4 data.  This should contain 2 sub-directories, u4i and
                          u4b.
        :param include_proper_motion: A boolean flag specifying whether to apply proper motion when retrieving the
                                      stars
        """

        super().__init__(include_proper_motion=include_proper_motion)

        directory = Path(directory)

        if not directory.exists():
            print("UCAC data not found at {}".format(directory), flush=True)
            user_response = input("Would you like to download the UCAC data to this directory (y/n)?\n"
                                  "  WARNING: THIS REQUIRES AN INTERNET CONNECTION, WILL TAKE A LONG TIME, AND WILL"
                                  " USE UP 9 GB OF SPACE!\n  ")

            if user_response[:1].lower() == 'y':
                # make sure the directory exists
                directory.mkdir(exist_ok=True, parents=True)
                download_ucac(directory)
            else:
                raise FileNotFoundError('The UCAC data is not available in the specified directory. Cannot '
                                        'initialize the UCAC4 class.')

        self.root_directory: Path = directory
        """
        The root directory containing the u4i and u4b catalogue files
        """

        self.bytes_per_rec = 78
        """
        The number of bytes per record in the binary files according to the UCAC4 documentation
        """

        self._index_file: Path = self.root_directory / 'u4i' / 'u4index.asc'
        """
        The index file to use to index into the binary files
        """

        if not self._index_file.is_file():
            print("UCAC index file not found at {}".format(directory), flush=True)
            user_response = input("Would you like to download the UCAC data to this directory (y/n)?\n"
                                  "  WARNING: THIS REQUIRES AN INTERNET CONNECTION, WILL TAKE A LONG TIME, AND WILL"
                                  " USE UP 9 GB OF SPACE!\n  ")

            if user_response[:1].lower() == 'y':
                download_ucac(directory)
            else:
                raise FileNotFoundError("We can't find the index file for the database. This is a required file\n"
                                        "that should be in the u4i directory in the UCAC directory. The file should\n"
                                        "be called u4index.asc. Please locate this file and place it in the proper\n"
                                        "directory. Please also verify the rest of your UCAC directory.")

        self._zone_files: List[Path] = [(self.root_directory / 'u4b' / 'z{0:0>3d}'.format(zone))
                                        for zone in np.arange(1, 901)]
        """
        The list of zone file paths that allow reading the data
        """

        for file in self._zone_files:
            if not file.exists():
                print("UCAC zone files not found at {}".format(directory), flush=True)
                user_response = input("Would you like to download the UCAC data to this directory (y/n)?\n"
                                      "\tWARNING: THIS REQUIRES AN INTERNET CONNECTION, WILL TAKE A LONG TIME, AND"
                                      " WILL USE UP 9 GB OF SPACE!\n  ")

                if user_response[:1].lower() == 'y':
                    download_ucac(directory)
                else:
                    raise FileNotFoundError("We can't find the zone files for the database. These are required files "
                                            "that should be in the u4b directory in the UCAC directory. The files "
                                            "should be called z001 through z900. Please locate these files and place "
                                            "them in the proper directory. Please also verify the rest of your UCAC "
                                            "directory.")

                break

        try:
            # noinspection SpellCheckingInspection
            self._supplement_file: TextIO = (self.root_directory / 'u4i' / 'u4supl.dat').open('r')
            """
            The supplement file object
            """

        except FileNotFoundError:
            print("UCAC supplement file not found at {}".format(directory), flush=True)
            user_response = input("Would you like to download the UCAC data to this directory (y/n)?\n"
                                  "  WARNING: THIS REQUIRES AN INTERNET CONNECTION, WILL TAKE A LONG TIME, AND WILL"
                                  " USE UP 9 GB OF SPACE!\n  ")

            if user_response[:1].lower() == 'y':
                download_ucac(directory)

                # noinspection SpellCheckingInspection
                self._supplement_file = (self.root_directory / 'u4i' / 'u4supl.dat').open('r')
            else:
                # noinspection SpellCheckingInspection
                raise FileNotFoundError("We can't find the supplement file for the database. This is a required file\n"
                                        "that should be in the u4i directory in the UCAC4 directory. The file should\n"
                                        "be called u4supl.dat. Please locate this file and place it in\n"
                                        "the proper directory. Please also verify the rest of your UCAC4 directory.")

        try:
            self._hpm_file: BinaryIO = (self.root_directory / 'u4i' / 'u4hpm.dat').open('rb')
            """
            The high proper motion supplement file.

            This is a text file which contains the proper motions for high proper motion stars.
            """

        except FileNotFoundError:
            print("UCAC high proper motion file not found at {}".format(directory), flush=True)
            user_response = input("Would you like to download the UCAC data to this directory (y/n)?\n"
                                  "  WARNING: THIS REQUIRES AN INTERNET CONNECTION, WILL TAKE A LONG TIME, AND WILL"
                                  " USE UP 9 GB OF SPACE!\n  ")

            if user_response[:1].lower() == 'y':
                download_ucac(directory)

                self._hpm_file = (self.root_directory / 'u4i' / 'u4hpm.dat').open('rb')
            else:
                raise FileNotFoundError("We can't find the high proper motion file. This is\n"
                                        "a required file that should be in the u4i directory in the UCAC4 directory.\n"
                                        "The file should be called u4hpm.dat. Please locate this file and place it\n"
                                        "in the proper directory. Please also verify the rest of your UCAC4 "
                                        "directory.")

        try:
            # noinspection SpellCheckingInspection
            self._hippo_file: BinaryIO = (self.root_directory / 'u4i' / 'hipsupl.dat').open('rb')
            """
            The Hipparcos cross reference supplement file.

            This contains bright stars included in the Hipparcos catalogue, including parallax.
            """

        except FileNotFoundError:
            print("UCAC hipparcos supplement file not found at {}".format(directory), flush=True)
            user_response = input("Would you like to download the UCAC data to this directory (y/n)?\n"
                                  "  WARNING: THIS REQUIRES AN INTERNET CONNECTION, WILL TAKE A LONG TIME, AND WILL"
                                  " USE UP 9 GB OF SPACE!\n  ")

            if user_response[:1].lower() == 'y':
                download_ucac(directory)

                # noinspection SpellCheckingInspection
                self._hippo_file = (self.root_directory / 'u4i' / 'hipsupl.dat').open('rb')
            else:
                # noinspection SpellCheckingInspection
                raise FileNotFoundError("We can't find the supplement file containing the Hipparcos data. This is\n"
                                        "a required file that should be in the u4i directory in the UCAC4 directory.\n"
                                        "The file should be called hipsupl.dat. Please locate this file and place\n"
                                        "it in the proper directory. Please also verify the rest of your UCAC4 "
                                        "directory.")

        try:
            self._tycho_cross_file: BinaryIO = (self.root_directory / 'u4i' / 'u4xtycho').open('rb')
            """
            This is the Tycho 2 cross reference file, which maps UCAC4 ids to Tycho 2 ids.
            """

        except FileNotFoundError:
            print("UCAC tycho supplement file not found at {}".format(directory), flush=True)
            user_response = input("Would you like to download the UCAC data to this directory (y/n)?\n"
                                  "  WARNING: THIS REQUIRES AN INTERNET CONNECTION, WILL TAKE A LONG TIME, AND WILL"
                                  " USE UP 9 GB OF SPACE!\n  ")

            if user_response[:1].lower() == 'y':
                download_ucac(directory)

                self._tycho_cross_file = (self.root_directory / 'u4i' / 'u4xtycho').open('rb')
            else:
                warnings.warn('Unable to find the Tycho 2 cross reference file. '
                              'This is required to get a truly complete\n'
                              'catalogue. Proceeding without the Tycho catalogue, '
                              'but be aware that your results will be\n'
                              'incomplete for bright stars. To use the Tycho 2 catalogue please locate the u4xtycho '
                              'file\nand place it in the u4i directory.')

        # noinspection SpellCheckingInspection
        self.cat_names: List[str] = ['ra', 'spd', 'magm', 'maga', 'sigmag', 'objt', 'cdf',  # 15 bytes
                                     'sigra', 'sigdc', 'na1', 'nu1', 'cu1',  # 5 bytes
                                     'cepra', 'cepdc', 'pmra', 'pmdc', 'sigpmr', 'sigpmd',  # 10 bytes
                                     'pts_key', 'j_m', 'h_m', 'k_m', 'icqflg_j', 'icqflg_h', 'icqflg_k',
                                     'e2mpho_j', 'e2mpho_h', 'e2mpho_k',  # 16 bytes
                                     'apasm_b', 'apasm_v', 'apasm_g', 'apasm_r', 'apasm_i',
                                     'apase_b', 'apase_v', 'apase_g', 'apase_r', 'apase_i', 'gcflg',  # 16 bytes
                                     'icf',  # 4 bytes
                                     'leda', 'x2m', 'rnm', 'zn2', 'rn2']  # 12 bytes
        """
        This is a list of the names of the catalogue star record columns according to the UCAC4 documentation.
        """

        self.cat_formats: List[str] = ['i4', 'i4', 'i2', 'i2', 'i1', 'i1', 'i1',  # 15 bytes
                                       'i1', 'i1', 'i1', 'i1', 'i1',  # 5 bytes
                                       'i2', 'i2', 'i2', 'i2', 'i1', 'i1',  # 10 bytes
                                       'i4', 'i2', 'i2', 'i2', 'i1', 'i1', 'i1', 'i1', 'i1', 'i1',  # 16 bytes
                                       'i2', 'i2', 'i2', 'i2', 'i2', 'i1', 'i1', 'i1', 'i1', 'i1', 'i1',  # 16 bytes
                                       'i4',  # 4 bytes
                                       'i1', 'i1', 'i4', 'i2', 'i4']  # 12 bytes
        """
        This is a list of the types for the raw UCAC4 star record columns according to the UCAC4 documentation.

        The types are numpy dtype strings.
        """

        self.cat_dtype: np.dtype = np.dtype({'names': self.cat_names, 'formats': self.cat_formats})
        """
        This is the numpy datatype that represents a record in the catalogue binary files.

        This is used when accessing the binary files as memory maps.
        """

        self.hpm_formats = copy.copy(self.cat_formats)
        """
        This is a list of the format strings for the high proper motion stars.

        It is mostly the same as the regular table since the stars are included in the main catalogue but need data
        from an extra table.
        """

        # these are the 2 columns that need a slightly different format
        new_hpm_format_inds = [14, 15]

        for ind in new_hpm_format_inds:
            self.hpm_formats[ind] = 'i8'

        self.hpm_dtype: np.dtype = np.dtype({'names': self.cat_names, 'formats': self.hpm_formats})
        """
        This specifies the numpy datatype used to represent the high proper motion stars.
        """

        self.index: Optional[np.ndarray] = None
        """
        This contains the index into the catalogue that is used to efficiently retrieve stars.

        The index is stored as a numpy structured array.  It gives the number of stars for each block in each zone
        file.
        """

        # make the index
        self.build_index()

    def query_catalogue(self, ids: Optional[ARRAY_LIKE] = None, min_ra: Real = 0, max_ra: Real = 360,
                        min_dec: Real = -90, max_dec: Real = 90, min_mag: Real = -4, max_mag: Real = 20,
                        search_center: Optional[ARRAY_LIKE] = None, search_radius: Optional[Real] = None,
                        new_epoch: Optional[Union[datetime, Real]] = None) -> pd.DataFrame:
        """
        This method queries stars from the catalogue that meet specified constraints and returns them as a DataFrame
        with columns of :attr:`.GIANT_COLUMNS`.

        Stars can either be queried by ID directly or by right ascension/declination/magnitude.  You cannot filter
        using both with this method.

        If :attr:`apply_proper_motion` is ``True`` then this will shift the stars to the new epoch input by the user
        (``new_epoch``) using proper motion.

        :param ids: A sequence of star ids to retrieve from the catalogue.  The ids are given by zone, rnz and should
                    be input as an iterable that yields tuples (therefore if you have a dataframe you should do
                    ``df.itertuples(index=False)``)
        :param min_ra: The minimum ra bound to query stars from in degrees
        :param max_ra: The maximum ra bound to query stars from in degrees
        :param min_dec: The minimum declination to query stars from in degrees
        :param max_dec: The maximum declination to query stars from in degrees
        :param min_mag: The minimum magnitude to query stars from.  Recall that magnitude is inverse (so a lower
                        magnitude is a brighter star)
        :param max_mag: The maximum magnitude to query stars from.  Recall that magnitude is inverse (so a higher
                        magnitude is a dimmer star)
        :param search_center: The center of a search cone as a ra/dec pair.
        :param search_radius: The radius about the center of the search cone
        :param new_epoch: The epoch to translate the stars to using proper motion if :attr:`apply_proper_motion` is
                          turned on
        :return: A Pandas dataframe with columns :attr:`GIANT_COLUMNS`.
        """

        cat_recs = self.query_catalogue_raw(ids=ids, min_ra=min_ra, max_ra=max_ra, min_dec=min_dec, max_dec=max_dec,
                                            min_visual_mag=min_mag, max_visual_mag=max_mag,
                                            search_center=search_center, search_radius=search_radius,
                                            generator=False)

        giant_record = self.convert_to_giant_catalogue(cat_recs)

        giant_record = giant_record[(giant_record.mag <= max_mag) & (giant_record.mag >= min_mag)]

        if self.include_proper_motion and (new_epoch is not None):
            apply_proper_motion(giant_record, new_epoch, copy=False)

        return giant_record

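    # An illustrative sketch of a cone search (the numeric values here are arbitrary, not from the catalogue):
    #
    #     cat = UCAC4()
    #     stars = cat.query_catalogue(search_center=(180.0, 0.0), search_radius=0.5,
    #                                 max_mag=9.0, new_epoch=2020.0)
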
    def query_catalogue_raw(self, ids: Optional[ARRAY_LIKE] = None, min_ra: Real = 0., max_ra: Real = 360.,
                            min_dec: Real = -90., max_dec: Real = 90.,
                            search_center: Optional[ARRAY_LIKE] = None, search_radius: Optional[Real] = None,
                            max_visual_mag: Real = 20., min_visual_mag: Real = -1.44,
                            generator: bool = False) -> Union[pd.DataFrame, Iterable[pd.DataFrame]]:
        """
        This method queries stars from the catalogue that meet specified constraints and returns them as a DataFrame
        or as an iterable of DataFrames where the columns are the raw catalogue columns.

        Stars can either be queried by ID directly or by right ascension/declination/magnitude.  You cannot filter
        using both with this method.

        This method is not usable by GIANT and it does not apply proper motion.  If you need records that are usable
        by GIANT and with proper motion applied see :meth:`query_catalogue`.  For details on what the columns are,
        refer to the UCAC4 documentation (which can be found online).

        :param ids: A sequence of star ids to retrieve from the catalogue.  The ids are given by zone, rnz and should
                    be input as an iterable that yields tuples (therefore if you have a dataframe you should do
                    ``df.itertuples(index=False)``)
        :param min_ra: The minimum ra bound to query stars from in degrees
        :param max_ra: The maximum ra bound to query stars from in degrees
        :param min_dec: The minimum declination to query stars from in degrees
        :param max_dec: The maximum declination to query stars from in degrees
        :param min_visual_mag: The minimum visual magnitude to query stars from.  Recall that magnitude is inverse
                               (so a lower magnitude is a brighter star)
        :param max_visual_mag: The maximum visual magnitude to query stars from.  Recall that magnitude is inverse
                               (so a higher magnitude is a dimmer star)
        :param search_center: The center of a search cone as a ra/dec pair.
        :param search_radius: The radius about the center of the search cone
        :param generator: A boolean flag specifying whether to return the stars as a generator (preserving memory) or
                          to collect all of the stars into a single dataframe.  If ``True`` then the function will
                          yield dataframes (which may themselves contain single or multiple stars).
        :return: An (iterable of) Pandas dataframe(s) with columns according to the catalogue columns.
        """

        if ids is not None:
            out = self.get_from_ids(ids)
        else:
            out = self._get_all_with_criteria(min_ra=min_ra, max_ra=max_ra, min_dec=min_dec, max_dec=max_dec,
                                              search_center=search_center, search_radius=search_radius,
                                              max_visual_mag=max_visual_mag, min_visual_mag=min_visual_mag)

        if generator is False:
            results = []

            for res in out:
                results.append(res)

            if results:
                return pd.concat(results)
            else:
                # noinspection SpellCheckingInspection
                return pd.DataFrame(columns=self.cat_names + ['zone', 'rnz', 'parallax', 'sigpara'])
        else:
            return out

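    # An illustrative sketch of the memory-conserving generator mode (the bounds are arbitrary):
    #
    #     cat = UCAC4()
    #     for raw_zone_frame in cat.query_catalogue_raw(min_ra=0, max_ra=5, generator=True):
    #         ...  # process each zone's raw records without collecting them all in memory
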
    def get_from_ids(self, ids: ARRAY_LIKE) -> Iterable[pd.DataFrame]:
        """
        This is a generator which returns single records for each requested ID.

        ``ids`` should be iterable with each element being length 2 and returning zone, rnz.  The yields will be the
        raw catalogue data.

        :param ids: A sequence of star ids to retrieve from the catalogue.  The ids are given by zone, rnz and should
                    be input as an iterable that yields tuples (therefore if you have a dataframe you should do
                    ``df.itertuples(index=False)``)
        :return: An iterable of Pandas dataframes with columns according to the catalogue columns.
        """

        for zone, rnz in ids:
            offset = (rnz - 1) * self.bytes_per_rec

            records = pd.DataFrame.from_records(np.fromfile(self._zone_files[zone - 1], dtype=self.cat_dtype,
                                                            offset=offset, count=1).astype(self.hpm_dtype),
                                                index='rnm')

            # append new columns
            records = records.assign(zone=zone, rnz=rnz, parallax=0., sigpara=0.)

            yield records

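    # An illustrative sketch with hypothetical (zone, rnz) ids:
    #
    #     cat = UCAC4()
    #     for record in cat.get_from_ids([(451, 1234), (452, 42)]):
    #         print(record[['ra', 'spd', 'apasm_v']])
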
    def _get_all_with_criteria(self, min_ra: Real = 0., max_ra: Real = 360.,
                               min_dec: Real = -90., max_dec: Real = 90.,
                               search_center: Optional[ARRAY_LIKE] = None, search_radius: Optional[Real] = None,
                               max_visual_mag: Real = 20., min_visual_mag: Real = -1.44) -> Iterable[pd.DataFrame]:
        """
        This function gets all stars meeting the criteria from the catalogue, yielding the results as DataFrames by
        zone.

        In general, the user should not interact with this method and instead should use :meth:`query_catalogue_raw`.

        :param min_ra: The minimum ra bound to query stars from in degrees
        :param max_ra: The maximum ra bound to query stars from in degrees
        :param min_dec: The minimum declination to query stars from in degrees
        :param max_dec: The maximum declination to query stars from in degrees
        :param min_visual_mag: The minimum visual magnitude to query stars from.  Recall that magnitude is inverse
                               (so a lower magnitude is a brighter star)
        :param max_visual_mag: The maximum visual magnitude to query stars from.  Recall that magnitude is inverse
                               (so a higher magnitude is a dimmer star)
        :param search_center: The center of a search cone as a ra/dec pair.
        :param search_radius: The radius about the center of the search cone
        :return: An iterable of Pandas dataframes with columns according to the catalogue columns.
        """

        if search_center is not None:
            # determine which zone files we need to check
            # find the bounds based on the specified min ra/dec and the search center
            min_ra = float(max(search_center[0] - search_radius, min_ra))
            max_ra = float(min(search_center[0] + search_radius, max_ra))
            min_dec = float(max(search_center[1] - search_radius, min_dec))
            max_dec = float(min(search_center[1] + search_radius, max_dec))

            if min_dec < -90.0:
                # if the search region includes the south pole take all RA
                min_dec = -90
                min_ra = 0
                max_ra = 360
            elif max_dec > 90.0:
                # if the search region includes the north pole take all RA
                max_dec = 90
                min_ra = 0
                max_ra = 360

            if min_ra < 0:
                # if the first point of Aries is included then we need to query both the high and low RA
                min_ra += 360

                # check the upper bounds
                zone_start_a, block_start_a = self.get_zone_block(min_ra, min_dec)
                # check the lower bounds
                zone_start_b, block_start_b = self.get_zone_block(0, min_dec)

                # determine the limiting case
                zone_start = min(zone_start_a, zone_start_b)
                block_start = min(block_start_a, block_start_b)

                # check the upper bounds
                zone_end_a, block_end_a = self.get_zone_block(360, max_dec)
                # check the lower bounds
                zone_end_b, block_end_b = self.get_zone_block(max_ra, max_dec)

                # determine the limiting case
                zone_end = max(zone_end_a, zone_end_b)
                block_end = max(block_end_a, block_end_b)

            elif max_ra > 360:
                # if the first point of Aries is included then we need to query both the high and low RA
                max_ra -= 360

                # check the upper bounds
                zone_start_a, block_start_a = self.get_zone_block(min_ra, min_dec)
                # check the lower bounds
                zone_start_b, block_start_b = self.get_zone_block(0, min_dec)

                # determine the limiting case
                zone_start = min(zone_start_a, zone_start_b)
                block_start = min(block_start_a, block_start_b)

                # check the upper bounds
                zone_end_a, block_end_a = self.get_zone_block(360, max_dec)
                # check the lower bounds
                zone_end_b, block_end_b = self.get_zone_block(max_ra, max_dec)

                # determine the limiting case
                zone_end = max(zone_end_a, zone_end_b)
                block_end = max(block_end_a, block_end_b)

            else:
                # nothing special needs to happen because we are inside the normal RA domain
                zone_start, block_start = self.get_zone_block(min_ra, min_dec)
                zone_end, block_end = self.get_zone_block(max_ra, max_dec)

        else:
            # if we don't have to deal with the search radius then just use what the user gave us
            zone_start, block_start = self.get_zone_block(float(min_ra), float(min_dec))
            zone_end, block_end = self.get_zone_block(float(max_ra), float(max_dec))

        # walk through each zone that we need to search
        for zone in np.arange(zone_start, zone_end + 1):

            # get the beginning and ending record number in the current file
            start, _ = self.index[self.get_index_ind(zone, block_start)]
            end_start, end_num = self.index[self.get_index_ind(zone, block_end)]

            number_of_stars = end_start + end_num - start

            # get the starting byte for the first record we need from this file and seek to it
            start_byte = self.bytes_per_rec * start

            # read in the correct rows.  First access as a memmap to conserve memory while we further limit the
            # results
            # noinspection PyTypeChecker
            data = np.memmap(self._zone_files[zone - 1], dtype=self.cat_dtype, mode='r', offset=start_byte,
                             shape=(number_of_stars,))

            # begin to further limit the results
            if search_center is not None:
                bearing_check = radec_distance(data['ra'] * MAS2RAD, data['spd'] * MAS2RAD - np.pi / 2,
                                               search_center[0] * DEG2RAD,
                                               search_center[1] * DEG2RAD) <= (search_radius * DEG2RAD)
            else:
                bearing_check = (data['ra'] >= min_ra * DEG2MAS) & (data['ra'] <= max_ra * DEG2MAS) & \
                                (data['spd'] >= (min_dec + 90) * DEG2MAS) & (data['spd'] <= (max_dec + 90) * DEG2MAS)

            # limit the magnitude based on the APASS V magnitude if available.
            # if it is not available, fall back to the UCAC model magnitude, which is much less reliable
            mag_check = (
                (data['apasm_v'] <= max_visual_mag * 1000) & (data['apasm_v'] >= min_visual_mag * 1000)
            ) | (
                (data['magm'] <= max_visual_mag * 1000) & (data['magm'] >= min_visual_mag * 1000)
            )

            valid_data = bearing_check & mag_check

            if not valid_data.any():
                continue

            records = pd.DataFrame.from_records(np.array(data[valid_data]).astype(self.hpm_dtype), index='rnm')

            # append new columns
            records = records.assign(zone=zone, rnz=np.where(valid_data)[0] + 1 + start, parallax=0., sigpara=0.)

            # check to see if hpm stars were encountered
            # noinspection SpellCheckingInspection
            hpm_check = (records["pmra"] == 32767)

            if hpm_check.any():
                hpm_records = records[hpm_check].copy()

                for rnm in hpm_records.index:
                    # should probably just store this in memory instead of doing a binary search each time
                    hpm_line = binary_search(self._hpm_file, rnm).decode().split()

                    # noinspection SpellCheckingInspection
                    hpm_records.loc[rnm, 'pmra'] = int(hpm_line[3])
                    # noinspection SpellCheckingInspection
                    hpm_records.loc[rnm, 'pmdc'] = int(hpm_line[4])

                records[hpm_check] = hpm_records

            # check to see if any hipparcos stars are included
            hip_flag = records['icf'].values // (10 ** 8)

            hip_check = (hip_flag == 7) | (hip_flag == 9)

            if hip_check.any():
                running_numbers = records[hip_check].index

                # get the information about each hipparcos star and add it to the data
                for rnm in running_numbers:
                    hip_rec = binary_search(self._hippo_file, rnm).decode().split()

                    records.loc[rnm, 'parallax'] = float(hip_rec[8])
                    # noinspection SpellCheckingInspection
                    records.loc[rnm, 'sigpara'] = float(hip_rec[13])

            yield records

    def build_index(self):
        """
        This method builds the in-memory index into the catalogue files.

        The index is stored in the :attr:`index` attribute as a numpy array.  It specifies the number of stars in
        each block in each zone to make it easy to seek to the right place.
        """

        # noinspection SpellCheckingInspection
        self.index = np.genfromtxt(self._index_file, dtype=[('start', np.uint32), ('nstars', np.uint16)],
                                   usecols=[0, 1])

    @staticmethod
    def get_zone_block(ra: Real, dec: Real) -> Tuple[int, int]:
        """
        This tells you the zone and block number that correspond to a specific ra/dec pair.

        :param ra: the right ascension under consideration in units of degrees
        :param dec: the declination under consideration in units of degrees
        :return: a tuple containing the zone and block location for the ra/dec
        """

        # zones are based on dec
        zone = int(np.floor((dec + 90) / 0.2) + 1)

        # blocks are based on ra
        ra_ind = int(np.floor(ra / 0.25) + 1)

        # need this check because ra is < 360, not <=
        if ra_ind > 1440:
            ra_ind = 1440

        # need this check for when the max dec is 90, because this is included in the last file
        if zone > 900:
            zone = 900

        return zone, ra_ind

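    # Worked example of the zone/block computation above: ra=180.0 deg, dec=0.0 deg gives
    # zone = floor((0.0 + 90) / 0.2) + 1 = 451 and block = floor(180.0 / 0.25) + 1 = 721,
    # so that star lives in zone file z451, block 721.
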
    @staticmethod
    def get_index_ind(zone: int, ra_ind: int) -> int:
        """
        This determines the location in the index that corresponds to the requested zone and block.

        :param zone: The zone number
        :param ra_ind: The right ascension (block) index
        :return: The row of the :attr:`index` array for the requested zone and block
        """

        return (zone - 1) * 1440 + ra_ind - 1

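    # Continuing the worked example: zone 451, block 721 corresponds to index row
    # (451 - 1) * 1440 + 721 - 1 = 648720, since each zone occupies 1440 consecutive rows of the index.
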
    @staticmethod
    def convert_to_giant_catalogue(ucac_records: pd.DataFrame) -> pd.DataFrame:
        """
        This method converts records in the catalogue format into records in the GIANT format.

        This is done by renaming columns and converting units.

        :param ucac_records: The raw records from the catalogue as a pandas DataFrame
        :return: The GIANT records as a Pandas DataFrame
        """

        # prep the ucac data frame (set the full index)
        ucac_records = ucac_records.assign(source='UCAC4').reset_index().rename(columns={'index': 'rnm'})
        ucac_records = ucac_records.set_index(['source', 'zone', 'rnz', 'rnm'])

        records = ucac_records.loc[:, _UCAC_COLS]

        records.rename(columns=_UCAC_TO_GIANT, inplace=True)

        # specify that the epoch of the stars is J2000
        records = records.assign(epoch=2000.0)

        records.dtypes.loc[GIANT_COLUMNS] = GIANT_TYPES

        # replace invalid magnitudes with the ucac model magnitude
        invalid_mag = records['mag'] == 20000

        records.loc[invalid_mag, 'mag'] = ucac_records.loc[invalid_mag, 'magm'].astype(np.float64)

        # convert to giant units
        records['ra'] /= DEG2MAS  # MAS to DEG
        records['dec'] /= DEG2MAS  # MAS to DEG
        records['dec'] -= 90.  # SPD to DEC
        records['distance_sigma'] /= records['distance'] ** 2  # convert parallax std to distance std
        records['distance'] /= 1000  # MAS to arcsecond
        records['distance'] **= -1  # parallax to distance (arcsecond to parsec)
        records['distance'] *= PARSEC2KM  # parsec to kilometers
        records['mag'] /= 1000.  # mMAG to MAG
        records['ra_sigma'] += 128  # to uint
        records['ra_sigma'] /= DEG2MAS  # to deg
        records['dec_sigma'] += 128  # to uint
        records['dec_sigma'] /= DEG2MAS  # to deg
        records['ra_proper_motion'] /= 10 * DEG2MAS  # 0.1 MAS/YR to DEG/YR
        records['dec_proper_motion'] /= 10 * DEG2MAS  # 0.1 MAS/YR to DEG/YR
        records['ra_pm_sigma'] /= 10 * DEG2MAS  # 0.1 MAS/YR to DEG/YR
        records['dec_pm_sigma'] /= 10 * DEG2MAS  # 0.1 MAS/YR to DEG/YR
        records['distance_sigma'] *= 1000  # 1/MAS to parsec
        records['distance_sigma'] *= PARSEC2KM  # parsec to km

        # convert the sigmas to J2000
        # noinspection SpellCheckingInspection
        ra_shift_time = 2000 - (ucac_records['cepra'] / 100 + 1900)
        # noinspection SpellCheckingInspection
        dec_shift_time = 2000 - (ucac_records['cepdc'] / 100 + 1900)

        records['ra_sigma'] = np.sqrt(records['ra_sigma'] ** 2 + ra_shift_time ** 2 * records['ra_pm_sigma'] ** 2)
        records['dec_sigma'] = np.sqrt(records['dec_sigma'] ** 2 + dec_shift_time ** 2 * records['dec_pm_sigma'] ** 2)

        # fix for stars with no parallax -- the distance standard deviation seems wrong for these
        default_distance_error = 20 / (STAR_DIST / PARSEC2KM / 1000) ** 2 * PARSEC2KM * 1000

        records['distance_sigma'].fillna(value=default_distance_error, inplace=True)
        records['distance'].replace([np.inf, -np.inf], STAR_DIST, inplace=True)

        # fix for stars where the parallax is invalid
        records.loc[records.distance < 0, 'distance'] = STAR_DIST
        records.loc[records.distance < 0, 'distance_sigma'] = default_distance_error

        return records

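    # Worked example of the distance conversion above for a hypothetical star with a 100 MAS parallax:
    # 100 MAS -> 0.1 arcsec -> 1 / 0.1 = 10 parsec -> 10 * PARSEC2KM km.
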
    def cross_ref_tycho(self, ucac_labels: ARRAY_LIKE, tycho_cat: Optional[Tycho2] = None) -> pd.DataFrame:
        """
        This retrieves the Tycho 2 catalogue records for the requested UCAC4 star zone, rnz values.

        :param ucac_labels: The UCAC4 labels as an iterable of zone, rnz pairs
        :param tycho_cat: The Tycho catalogue instance to use.  If ``None``, a default instance will be created
        :return: The raw Tycho 2 star records.
        """

        if tycho_cat is None:
            tycho_cat = Tycho2()

        out = []

        # loop through all of the UCAC stars
        for label in ucac_labels:

            # compose the UCAC id as included in the u4xtycho file
            query = label[0] * 1e6 + label[1]

            # search the u4xtycho file to see if this corresponds to a tycho star
            line = binary_search(self._tycho_cross_file, query, column=1)

            if line is not None:
                line = line.decode()

                # determine the tycho 2 id from the u4xtycho file
                t2id = ' '.join([line[:4], line[5:10], line[11]])

                # get the tycho 2 star record
                rec = tycho_cat.retrieve_record(t2id)

                out.append(rec)
            else:
                warnings.warn('The requested UCAC4 star does not have a tycho reference associated with it: '
                              '{}'.format(label))

                # todo: may need to figure out how to get a unique id here so it doesn't get overwritten
                out.append(tycho_cat.nan_frame())

        return pd.concat(out)

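    # Illustrative example with a hypothetical label: the star (zone=451, rnz=1234) is searched for in the
    # u4xtycho file under the composed id 451 * 1e6 + 1234 = 451001234.
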
    def dump_to_sqlite(self, database_connection: Connection, limiting_mag: Real = 20,
                       use_tycho_mag: bool = False, return_locations: bool = False,
                       return_mag: Optional[Real] = None) -> Optional[pd.DataFrame]:
        """
        Use this to write the catalogue to a sqlite3 database in the GIANT format.

        You can control what stars/data are included using the keyword argument inputs.  You can also have this
        return the star magnitudes/locations for doing star blending.

        In general you should not use this directly.  Instead you should use
        :func:`~.giant_catalogue.build_catalogue` or the script :mod:`~.scripts.build_catalogue`.

        :param database_connection: The connection to the database the data is to be dumped to
        :param limiting_mag: The maximum magnitude to include in the catalogue.  This is based on the APASM_V
                             magnitude or the UCAC4 magm magnitude, depending on which is available
        :param use_tycho_mag: This flag stores the magnitude from the Tycho 2 catalogue for each star that is in the
                              Tycho 2 catalogue.  Note that this will be very slow.
        :param return_locations: This flag specifies to return locations for stars for doing blending
        :param return_mag: This flag specifies to only return locations for stars that are brighter than this
                           magnitude.  If ``None`` then all stars are returned.
        :return: A dataframe of the dumped stars that meet the ``return_mag`` condition or ``None`` if
                 ``return_locations`` is ``False``
        """

        from .tycho import Tycho2

        # if we want to use the tycho information in place of UCAC4 then create the tycho catalogue interface
        if use_tycho_mag:
            tycho = Tycho2()
        else:
            tycho = None

        # list for returning the results if we are doing that
        out = []

        start = time.time()
        print('dumping zone {}'.format(1), flush=True)

        # loop through each zone file and dump it
        for ind, records in enumerate(self.query_catalogue_raw(max_visual_mag=limiting_mag, generator=True)):

            # convert into the GIANT format
            giant_records = self.convert_to_giant_catalogue(records)

            # if we are cross referencing the tycho catalogue do it
            if use_tycho_mag:
                tycho_recs = self.cross_ref_tycho(giant_records.index.droplevel(['source', 'rnm']).tolist(),
                                                  tycho_cat=tycho)

                # add a column so we can see where the tycho information came from
                giant_records.loc[:, 'tycho id'] = ''

                for star_index, magnitude_index in enumerate(giant_records.index):
                    if not np.isnan(tycho_recs.iloc[star_index].VTmag):
                        # update the magnitude if we found a tycho record
                        giant_records.loc[magnitude_index, 'tycho id'] = '{}-{}-{}'.format(
                            *tycho_recs.iloc[star_index].name
                        )

                        giant_records.loc[magnitude_index, 'mag'] = tycho_recs.iloc[star_index].VTmag

            # set the index to be the rnm
            giant_records = giant_records.reset_index().set_index('rnm')

            # dump it out to the GIANT catalogue in the stars table
            giant_records.to_sql('stars', database_connection, if_exists='append')

            print('zone dumped in {:.3f} secs'.format(time.time() - start), flush=True)

            if return_locations:
                if return_mag is not None:
                    out.append(giant_records.loc[giant_records.mag <= return_mag, ["ra", "dec", "mag"]])
                else:
                    out.append(giant_records.loc[:, ["ra", "dec", "mag"]])

            zone = ind + 2
            start = time.time()
            print('dumping zone {}'.format(zone), flush=True)

        if return_locations:
            return pd.concat(out)

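    # An illustrative sketch of dumping to a database (the file name here is hypothetical):
    #
    #     import sqlite3
    #     cat = UCAC4()
    #     with sqlite3.connect('giant_stars.db') as connection:
    #         cat.dump_to_sqlite(connection, limiting_mag=12.0)
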
class ColumnOrder(Enum):
    """
    This enumeration specifies whether a column is sorted in ascending or descending order.

    This is intended to be used as an input to :func:`binary_search`.
    """

    ASCENDING = "ASCENDING"
    """
    The column is sorted in ascending order (smallest first)
    """

    DESCENDING = "DESCENDING"
    """
    The column is sorted in descending order (smallest last)
    """

def check_file(file: Path, md5s: Optional[dict] = None) -> Tuple[bool, Optional[dict]]:
    """
    This checks the md5 sum of a file to ensure it wasn't corrupted.

    .. warning::
        This function can only verify the integrity of the files through md5 sums, which are insecure.  While the
        vizier service is trusted, use this function at your own risk.

    :param file: The file to check the md5 of
    :param md5s: A dictionary containing the md5 sums for each file
    :return: a flag specifying if the file was correct and the dictionary of md5 sums used to check the file
    """

    import hashlib

    if md5s is None:
        md5_file = file.parent / "md5sum.txt"  # type: Path

        if md5_file.exists():
            md5s = {}

            with md5_file.open('r') as m_file_obj:
                for line in m_file_obj:
                    if line.startswith('#'):
                        continue
                    else:
                        # each line is "<md5> <filename>"
                        fmd5, fname = line.split()

                        md5s[fname.replace('*', '')] = fmd5
        else:
            return False, None

    if file.exists():
        base = file.name

        if base != "md5sum.txt":
            # MD5 has a security risk but it's better than nothing...
            return md5s.get(base, None) == hashlib.md5(file.open('rb').read()).hexdigest(), md5s  # nosec
        else:
            return True, md5s
    else:
        return False, md5s

def download_ucac(target_directory: Path):
    """
    This function downloads the UCAC4 catalogue from vizier to the target directory.

    This is done over ftp.  It requires an active internet connection that can connect to cdsarc.u-strasbg.fr

    .. warning::
        This download will take a long time and use up approximately 9 GB of space.

    .. warning::
        This download can only verify the integrity of the files through md5 sums, which are insecure.  While the
        vizier service is trusted, use this function at your own risk.

    :param target_directory: the directory to save the UCAC catalogue to
    """

    # we minimize the security risk here by using FTPS
    import ftplib  # nosec

    # FTPS is secure
    ftp = ftplib.FTP_TLS('cdsarc.u-strasbg.fr')  # nosec

    # anonymous login since we're just grabbing data
    ftp.connect()
    ftp.sendcmd('USER anonymous')
    ftp.sendcmd('PASS anonymous@a.com')

    ftp.cwd('pub/cats/I/322A/UCAC4/')

    lines = []

    ftp.retrlines('LIST', callback=lines.append)

    for line in lines:
        if line.startswith('d'):  # directory
            name = line.split()[-1]

            if name in ["u4b", "u4i"]:
                local_directory = target_directory / name

                local_directory.mkdir(parents=True, exist_ok=True)

                download_lines = []

                ftp.retrlines('LIST {}'.format(name), callback=download_lines.append)

                md5s = None

                for dl in download_lines:
                    start = time.time()

                    file = dl.split()[-1]

                    local = local_directory / file

                    downloaded, md5s = check_file(local, md5s)

                    if not downloaded:
                        with local.open('wb') as download_file:
                            # noinspection SpellCheckingInspection
                            ftp.retrbinary('RETR {}/{}'.format(name, file), download_file.write)

                        if name == 'u4b':
                            print('{} of z900 done in {:.3f}'.format(file, time.time() - start), flush=True)
                        else:
                            print('{} done in {:.3f}'.format(file, time.time() - start), flush=True)
                    else:
                        print('{} already downloaded'.format(file))