# Source code for cenpy.tiger

from six import iteritems as diter
import requests as r
import pandas as pd
try:
    from geopandas import GeoDataFrame
except (ImportError, OSError):
    raise ImportError('Geopandas is required to do spatial operations, and'
                          ' must be installed in order to use the cenpy product API.'
                          ' For directions on how to install geopandas, consult'
                          ' https://geopandas.org/install.html. Ensure that all'
                          ' of the dependencies, including rtree, are installed.'
                          ' Normally, installing geopandas through the '
                          ' Anaconda Software Distribution (https://repo.continuum.io)'
                          ' in the "conda-forge" software channel will work.')
import copy

from . import geoparser as gpsr

# All queries go to map services mounted under
# tigerweb.geo.census.gov/arcgis/rest/services/TIGERweb/
# A MapServer is addressed by appending <name>/MapServer to that root.

# Every service type listed at that url?f=json is a MapServer.

# Root of the TIGERweb ArcGIS REST services; service and layer urls are built on it.
_baseurl = "http://tigerweb.geo.census.gov/arcgis/rest/services/TIGERweb"
# Reference pages kept alongside the service root.
# NOTE(review): presumably the projected / geographic coordinate system code
# lists -- confirm; neither name is used in this part of the module.
_pcs = "https://developers.arcgis.com/javascript/jshelp/pcs.html"
_bcs = "https://developers.arcgis.com/javascript/jshelp/bcs.html"

# Template of query options and their defaults. Each query starts from a deep
# copy of this mapping, so the key order here is the order of the parameters
# in the generated query string.
_basequery = {
    # sql query component
    'where': '',
    # raw text search
    'text': '',
    # only grab these objects
    'objectIds': '',
    # time instant / time extent to query
    'time': '',
    # spatial filter to apply to the query
    'geometry': '',
    # kind of geometry supplied as the spatial filter
    'geometryType': 'esriGeometryEnvelope',
    # spatial reference of the input geometry
    'inSR': '',
    # relation to test in a DE9IM spatial query
    'spatialRel': '',
    # used if an arbitrary spatialRel is applied
    'relationParam': '',
    # fields to pass from the header out
    'outFields': '*',
    # whether to pass geometry out
    'returnGeometry': True,
    # set a spatial offset
    'maxAllowableOffset': '',
    'geometryPrecision': '',
    # spatial reference of the returned geometry
    'outSR': '',
    # whether to return only ObjectIDs
    'returnIdsOnly': False,
    # NOTE(review): undocumented here, probably relates to the sql query
    'returnCountOnly': False,
    # NOTE(review): undocumented here, probably relates to the sql query
    'orderByFields': '',
    # NOTE(review): undocumented here, probably relates to the sql query
    'groupByFieldsForStatistics': '',
    # NOTE(review): purpose not documented here -- confirm against the API reference
    'outStatistics': '',
    # whether to return z components of shp-z
    'returnZ': False,
    # whether to return m components of shp-m
    'returnM': False,
    # geodatabase version name
    'gdbVersion': '',
    # NOTE(review): purpose not documented here -- confirm against the API reference
    'returnDistinctValues': '',
}


def _jget(st):
    """Send a GET request to ``st`` with ``?f=json`` appended and return the response."""
    json_url = st + '?f=json'
    return r.get(json_url)


def available(verbose=False):
    """
    List the MapServices published through the TIGERweb geoAPI.

    Parameters
    -----------
    verbose :   int or bool
                verbosity level. ``-1`` returns only the service names; a
                falsy value returns the service records; ``1``/``True`` also
                fetches a description for every record; any other value
                returns the whole decoded response, descriptions included.

    Returns
    -------
    list or dict of available MapServers through TIGERweb
    """
    response = _jget(_baseurl)
    response.raise_for_status()
    catalog = response.json()
    services = catalog['services']

    # keep only the text after the last '/' of every service name
    for service in services:
        service['name'] = service['name'].split('/')[-1]

    if verbose == -1:
        return [service['name'] for service in services]
    if not verbose:
        return services

    # every remaining level needs one extra request per service
    print('verbose may take a bit...')
    detail_urls = ['/'.join([_baseurl, service['name'], service['type']])
                   for service in services]
    for service, url in zip(services, detail_urls):
        detail = _jget(url)
        detail.raise_for_status()
        service['description'] = detail.json()['description']

    # equality (not identity) on purpose: the integer 1 counts as True here
    return services if verbose == True else catalog  # noqa: E712


class ESRILayer(object):
    """The fundamental building block to access a single Geography/Layer in an ESRI MapService"""

    def __init__(self, baseurl, **kwargs):
        """
        Class representing the ESRI Layer in the TIGER API

        Parameters
        ----------
        baseurl :   str
                    the url for the Layer.
        **kwargs :  layer metadata. Every entry is stored on the instance
                    under an underscore-prefixed attribute name. ``id`` is
                    required (it is appended to ``baseurl``); ``fields``, when
                    present, is exposed as the ``variables`` dataframe.
        """
        self.__dict__.update({'_' + k: v for k, v in kwargs.items()})
        if hasattr(self, '_fields'):
            self.variables = pd.DataFrame(self._fields)
        self._baseurl = baseurl + '/' + str(self._id)

    def __repr__(self):
        try:
            return '(ESRILayer) ' + self._name
        except (AttributeError, TypeError):
            # no usable name in the layer metadata
            return ''

    @staticmethod
    def _canonical_option(name, known):
        """
        Map a user-supplied option name onto the matching name in ``known``.

        Underscores are dropped and the comparison ignores case, so both
        ``return_geometry`` and ``returnGeometry`` resolve to
        ``returnGeometry``. Names without a match are returned with their
        underscores removed, so unrecognized options are still passed through.
        """
        stripped = ''.join(name.split('_'))
        lookup = {key.lower(): key for key in known}
        return lookup.get(stripped.lower(), stripped)

    @staticmethod
    def _wants_geometry(value):
        """Return False only if ``value`` is False or spells 'false' in any letter case."""
        return str(value).strip().lower() != 'false'

    @staticmethod
    def _extract_features(datadict):
        """
        Return ``datadict['features']``.

        Raises
        ------
        KeyError if the response has no features; the message carries the
        error code, message and details found in the response.
        """
        try:
            return datadict['features']
        except KeyError:
            error = datadict.get('error', {})
            code, msg = error.get('code'), error.get('message')
            # an empty or missing details list gets a readable placeholder
            details = error.get('details') or 'Mapserver provided no detailed error'
            raise KeyError(('Response from API is malformed. You may have '
                            'submitted too many queries, formatted the request incorrectly, '
                            'or experienced significant network connectivity issues.'
                            ' Check to make sure that your inputs, like placenames, are spelled'
                            ' correctly, and that your geographies match the level at which you'
                            ' intend to query. The original error from the Census is:\n'
                            '(API ERROR {}:{}({}))'.format(code, msg, details)))

    def query(self, raw=False, strict=False, **kwargs):
        """
        A query function to extract data out of MapServer layers. I've exposed
        every option here

        Option names may be given in snake_case (``return_geometry``) or in
        the API's own spelling (``returnGeometry``); they are matched against
        the known query options ignoring underscores and letter case. Options
        that match nothing are passed to the server unchanged, apart from
        having their underscores removed.

        Parameters
        ----------
        where: str, required
                    sql query string.
        out_fields: list or str
                    fields to pass from the header out (default: '*')
        return_geometry: bool
                    bool describing whether to return geometry or just the
                    dataframe. (default: True)
        geometry_precision: str
                    a number of significant digits to which the output of the
                    query should be truncated (default: None)
        out_sr: int or str
                    ESRI WKID spatial reference into which to reproject
                    the geodata (default: None)
        return_ids_only: bool
                    bool stating to only return ObjectIDs. (default: False)
        return_z: bool
                     whether to return z components of shp-z, (default: False)
        return_m: bool
                     whether to return m components of shp-m, (default: False)
        strict  :   bool
                    whether to throw an error if invalid polygons are provided from the API (True)
                    or just warn that at least one polygon is invalid (default: False)
        raw : bool
              whether to provide the raw geometries from the API  (default: False)

        Returns
        -------
        Dataframe or GeoDataFrame containing entries from the geodatabase

        Raises
        ------
        KeyError if the response contains no features (for instance because
        the server answered with an error document).

        Notes
        -----
        Most of the time, this should be used leaning on the SQL "where"
        argument:

        cxn.query(where='GEOID LIKE "06*"')

        In most cases, you'll be querying against layers, not MapServices
        overall.
        """
        # parse args: resolve every option name to the spelling the API uses
        options = {self._canonical_option(k, _basequery): v
                   for k, v in kwargs.items()}

        # construct query string on a fresh copy so the module template stays pristine
        self._basequery = copy.deepcopy(_basequery)
        self._basequery.update(options)
        qstring = '&'.join(['{}={}'.format(k, v)
                            for k, v in self._basequery.items()])
        self._last_query = self._baseurl + '/query?' + qstring

        # run query
        resp = r.get(self._last_query + '&f=json')
        resp.raise_for_status()
        datadict = resp.json()
        if raw:
            return datadict

        # surface API errors before either output format is built
        features = self._extract_features(datadict)
        if not self._wants_geometry(self._basequery['returnGeometry']):
            return pd.DataFrame.from_records([x['attributes'] for x in features])

        # convert to output format
        todf = []
        for feature in features:
            # the parser is looked up in geoparser by the response's geometry type
            locfeat = gpsr.__dict__[datadict['geometryType']](feature)
            record = locfeat['properties']
            record.update({'geometry': locfeat['geometry']})
            todf.append(record)
        df = pd.DataFrame(todf)
        outdf = gpsr.convert_geometries(df, strict=strict)
        outdf = GeoDataFrame(outdf)
        crs = datadict.pop('spatialReference', None)
        if crs is not None:
            wkid = crs.get('latestWkid', crs.get('wkid'))
            # never build 'epsg:None' when the response carries no usable code
            crs = dict(init='epsg:{}'.format(wkid)) if wkid is not None else None
        outdf.crs = crs
        return outdf


class TigerConnection(object):
    """The fundamental building block for US Census Bureau's Geographic, an ESRI MapService"""

    def __init__(self, name=None):
        """
        Parameters
        ----------
        name    :   str
                    string describing the API to connect to

        Raises
        ------
        KeyError if ``name`` is not among the names reported by
        ``available(verbose=-1)``.
        """
        if name not in available(verbose=-1):
            raise KeyError(
                'Dataset {n} not found. Please check cenpy.tiger.available()'.format(n=name))
        self._baseurl = '/'.join([_baseurl, name, 'MapServer'])
        resp = _jget(self._baseurl)
        resp.raise_for_status()
        resp = resp.json()
        self._key = name
        # fall back on the requested name when the service reports no mapName
        self.title = resp.pop('mapName', name)
        self.layers = self._get_layers()
        self.copyright = resp['copyrightText']
        self.projection = self._resolve_wkid(resp['spatialReference'])

    @staticmethod
    def _resolve_wkid(spatial_reference):
        """
        Pick the WKID out of a spatial reference mapping.

        ``latestWkid`` is preferred and ``wkid`` is the fallback, the same
        order ESRILayer.query uses for the CRS of its results. Returns None
        when neither key is present.
        """
        return spatial_reference.get('latestWkid', spatial_reference.get('wkid'))

    def _get_layers(self):
        """Fetch the service's ``/layers`` document and wrap every entry in an ESRILayer."""
        resp = _jget(self._baseurl + '/layers')
        resp.raise_for_status()
        resp = resp.json()
        return [ESRILayer(self._baseurl, **d) for d in resp['layers']]

    def query(self, **kwargs):
        """
        method to query the ESRI API. Passes down to an appropriately-chosen layer.

        Parameters
        ----------
        layer   :   int or str
                    position of the layer in ``self.layers``, or a string that
                    is fuzzy-matched against the layers' reprs.
        **kwargs :  every other keyword is forwarded to the layer's ``query``.

        Raises
        ------
        Exception if no ``layer`` is given.
        """
        layer_result = kwargs.pop('layer', None)
        if isinstance(layer_result, str):
            # imported lazily, at call time rather than at module import
            from .products import _fuzzy_match
            # NOTE(review): assumes `.index` of the match result is usable as a
            # position in self.layers -- confirm against _fuzzy_match
            layer_result = _fuzzy_match(layer_result,
                                        [repr(f) for f in self.layers]).index
        if layer_result is None:
            raise Exception('No layer selected.')
        return self.layers[layer_result].query(**kwargs)