Source code for cenpy.tools

import pandas as pd
import os
import warnings as warn
import time
from .explorer import fips_table as _ft
from requests import HTTPError
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(arg, **kwargs):
        return arg
_state_fipscodes = _ft('state')['FIPS Code']
_state_fipscodes = [str(f).rjust(2, '0') for f in _state_fipscodes if f < 60] 

def national_to_block(cxn, *columns, wait_by_state=0, 
                                     wait_by_county=0):
    """
    A helper function to grab all blocks by iterating over state fips codes in cenpy.explorer.fips_table. 
    This just naively calls state_to_block for each state, so will end up executing quite a few queries. 
    You may be rate limited if you don't use an APIKEY

    Parameters
    ---------
    cxn     :   cenpy.base.Connection object
                    the connection to use to query.
    columns :   comma separated collection
                    splatted comma separated collection of column names
                    to grab from the connection. A call may be like:
                    >>> tools.national_to_block(cxn, *cxn.varslike('H001*"))
    wait_by_state : callable or int
                    if an integer, gives the number of seconds passed 
                    to sys.sleep between each state query. if callable,
                    will be called each state to get a sleep time. This
                    means you can use random waittimes. So, to wait
                    for an exponentially-distributed number of seconds:
                    >>> import random
                    >>> tools.national_to_block(cxn, 
                                                *cxn.varslike("P001*"), 
                                                wait_by_state = lambda : (random.expovariate(2) * 20))
    wait_by_county: callable or int
                    wait time (or wait time callable) applied between each county-level query. See wait_by_state for more information. If this value tends to be large, the time it takes to conduct the query can get large quickly. 
    """
    if isinstance(wait_by_state, (int, float)):
        waitfunc = lambda : wait_by_state
    else:
        waitfunc = wait_by_state
    outs = []
    for fp in tqdm(_state_fipscodes):
        print(fp)
        try:
            outs.append(state_to_block(fp, cxn, *columns, 
                                       wait=wait_by_county))
        except HTTPError:
            warn.warn('Something failed in state {}, terminating prematurely'.format(fp))
            raise
        time.sleep(waitfunc())
    return pd.concat(outs)

def national_to_tract(cxn, *columns, wait_by_state = 0,
                                     wait_by_county = 0):
    """
    A helper function to grab all tracts by iterating over state fips codes in cenpy.explorer.fips_table. 
    This just naively calls state_to_tract for each state, so will end up executing quite a few queries. 
    You may be rate limited if you don't use an APIKEY
    """
    if isinstance(wait_by_state, (int, float)):
        waitfunc = lambda : wait_by_state
    else:
        waitfunc = wait_by_state
    outs = []
    for fp in _state_fipscodes:
        try:
            outs.append(state_to_tract(fp, cxn, *columns, 
                                       wait=wait_by_county))
        except HTTPError:
            warn.warn('Something failed in state {}, terminating prematurely'.format(fp))
            raise
        time.sleep(waitfunc())
    return pd.concat(outs)

def national_to_blockgroup(cxn, *columns, wait_by_state=0, wait_by_county=0):
    """
    A helper function to grab all blockgroups by iterating over state fips codes in cenpy.explorer.fips_table. 
    This just naively calls state_to_blockgroup for each state, so will end up executing quite a few queries. 
    You may be rate limited if you don't use an APIKEY
    """
    if isinstance(wait_by_state, (int, float)):
        wait_by_state = lambda : wait_by_state
    else:
        waitfunc = wait_by_state
    outs = []
    for fp in _state_fipscodes:
        try:
            outs.append(state_to_blockgroup(fp, cxn, *columns, wait=wait_by_county))
        except HTTPError:
            warn.warn('Something failed in state {}, terminating prematurely'.format(fp))
            raise
        time.sleep(waitfunc(a))
    return pd.concat(outs)

def state_to_block(stfips, cxn, *columns, wait=0):
    """
    Casts the generator constructed by genstate_to_block to a full dataframe. 
    For arguments, see genstate_to_block
    """
    if isinstance(wait, (int, float)):
        waitfunc = lambda : wait
    else:
        waitfunc = wait
    out = []
    for cblock in genstate_to_block(stfips, cxn, *columns):
        out.append(cblock)
        time.sleep(waitfunc())
    return pd.concat(out)

def state_to_blockgroup(stfips, cxn, *columns, wait=0):
    """
    Casts the generator constructed by genstate_to_blockgroup to a full dataframe. 
    For arguments, see genstate_to_blockgroup
    """
    if isinstance(wait, (int, float)):
        waitfunc = lambda : wait
    else:
        waitfunc = wait
    out = []
    for cblock in genstate_to_blockgroup(stfips, cxn, *columns):
        out.append(cblock)
        time.sleep(waitfunc())
    return pd.concat(out)

def state_to_tract(stfips, cxn, *columns, wait=0):
    """
    Casts the generator constructed by genstate_to_tract to a full dataframe. 
    Parameters
    ---------
    stfips  :   str
                two-digit state fips code used to grab tracts
    cxn     :   cenpy.base.Connection
                connection object against which the query occurs
    *columns:   comma separated collection
                splatted list of columns to use in the query:
                >>> state_to_tract('06', cxn, 'H0010001', 'P001002')
                >>> #is equivalent to:
                >>> state_to_tract('06', cxn, *['H0010001', 'P001002'])
    wait    :   int or callable
                a number of seconds to wait before the next county
                is queried. Must be an integer or a function that 
                takes no arguments and returns a number of seconds 
                to wait. 
                
    For arguments, see genstate_to_tract
    """
    if isinstance(wait, (int, float)):
        waitfunc = lambda : wait
    else:
        waitfunc = wait
    out = []
    for cblock in genstate_to_tract(stfips, cxn, *columns):
        out.append(cblock)
        time.sleep(waitfunc())
    return pd.concat(out)

def county_to_block(stfips, ctfips, cxn, *columns, wait=0):
    """
    Returns all blocks underneath a county

    Parameters
    ---------
    stfips : int or str describing the state's fips code
    ctfips : int or str describing the county's fips code
    *columns: splatted list of columns to grab from the API

    Returns
    -------
    dataframe containing the entries of columns for each block. 
    """
    if isinstance(wait, (int, float)):
        waitfunc = lambda : wait
    else:
        waitfunc = wait
    out = []
    for cblock in gencounty_to_block(stfips, ctfips, cxn, *columns):
        out.append(cblock)
        time.sleep(waitfunc())
    return pd.concat(out)

def genstate_to_block(stfips, cxn, *columns):
    """
    Generator to handle geo-in-geo queries without the user having to worry about wrangling the counties. 

    Parameters
    ----------
    stfips  :   str
                fips of the state 
    cxn     :   cenpy.base.Connection
                connection instance
    *columns:   str
                columns that are desired by the user to grab for each block. 
    Returns
    -------
    a Generator that yields dataframes.
    """
    counties = cxn.query(['NAME'], geo_unit='county', geo_filter={'state':stfips})
    counties = counties.county.tolist()
    ctracts = ((county, tract) for county in counties 
                for tract in cxn.query(['NAME'], geo_unit='tract', 
                                       geo_filter={'state':stfips, 'county':county}).tract)
    for county,tract in tqdm(ctracts):
        blocks = cxn.query(['NAME'] + list(columns), geo_unit='block', 
                           geo_filter={'state':stfips, 
                                       'county':county,
                                       'tract':tract})
        yield blocks

def gencounty_to_block(stfips, ctfips, cxn, *columns):
    """
    Generator to handle geo-in-geo queries without the user having to worry about wrangling the tracts
    within a county.

    Parameters
    ---------
    stfips : str
             fips of the state
    ctfips : str
             fips of the county
    cxn    : connection to use
    *columns: splatted list of the columns to query, all strings

    Returns
    -------
    a Generator that yields dataframes
    """
    start_filter = dict(state=str(stfips).rjust(2, '0'), county=str(ctfips).rjust(3, '0'))
    tracts = cxn.query(['NAME'], geo_unit='tract', geo_filter=start_filter)
    for tract in tqdm(tracts.tract):
        start_filter.update(dict(tract=tract))
        blocks = cxn.query(['NAME'] + list(columns), geo_unit = 'block', geo_filter = start_filter)
        yield blocks

def genstate_to_blockgroup(stfips, cxn, *columns):
    """
    Generator to handle geo-in-geo queries without the user having to worry about wrangling the counties. 

    Parameters
    ----------
    stfips  :   str
                fips of the state 
    cxn     :   cenpy.base.Connection
                connection instance
    *columns:   str
                columns that are desired by the user to grab for each blockgroup. 
    Returns
    -------
    a Generator that yields dataframes.
    """
    counties = cxn.query(['NAME'], geo_unit='county', geo_filter={'state':stfips})
    counties = counties.county.tolist()
    ctracts = ((county, tract) for county in counties 
                for tract in cxn.query(['NAME'], geo_unit='tract', 
                                       geo_filter={'state':stfips, 'county':county}).tract)
    for county,tract in tqdm(ctracts):
        blockgroups = cxn.query(['NAME'] + list(columns), geo_unit='blockgroup', 
                           geo_filter={'state':stfips, 
                                       'county':county,
                                       'tract':tract})
        yield blockgroups

def genstate_to_tract(stfips, cxn, *columns):
    """
    Generator to handle geo-in-geo queries without the user having to worry about wrangling the counties. 

    Parameters
    ----------
    stfips  :   str
                fips of the state 
    cxn     :   cenpy.base.Connection
                connection instance
    *columns:   str
                columns that are desired by the user to grab for each tract. 
    Returns
    -------
    a Generator that yields dataframes.
    """
    counties = cxn.query(['NAME'], geo_unit='county', geo_filter={'state':stfips})
    counties = counties.county.tolist()
    for county in tqdm(counties):
        tract = cxn.query(['NAME'] + list(columns), geo_unit='tract', 
                           geo_filter={'state':stfips, 
                                       'county':county})
        yield tract

[docs]def set_sitekey(sitekey, overwrite=False):
    """
    Save the sitekey so that users can access it via cenpy.SITEKEY. 
    This lets users bind an API key to a given installation. 

    Parameters
    -----------
    sitekey     :   str
                    str containing the census data api key the user wants to bind
    overwrite   :   bool
                    flag denoting whether to overwrite existing sitekey. Defaults to False. 

    Returns
    --------
    path to the location of the sitekey file or raises FileError if overwriting is prohibited and the file exists.  
    """
    thispath = os.path.dirname(os.path.abspath(__file__))
    targetpath = os.path.join(thispath, 'SITEKEY.txt')
    if os.path.isfile(targetpath):
        if not overwrite:
            raise FileError('SITEKEY already bound and overwrite flag is not set')
    with open(targetpath, 'w') as outfile:
        outfile.write(sitekey)
    return targetpath

def _load_sitekey():
    basepath = os.path.dirname(os.path.abspath(__file__))
    targetpath = os.path.join(basepath, 'SITEKEY.txt')
    try:
        with open(targetpath, 'r') as f:
            s = f.read()
        return s.strip()
    except FileNotFoundError:
        return None