Source code for TXHousing.analysis.parcel_graphs

"""Graphs which mostly rely on municipal parcel data. Note that all of these graphs rely on the cached parcel data in
 csv format, not the actual parcel data, which is processed and cached in the data_processing package."""

import numpy as np
import pandas as pd
from ..data_processing import parcel
from plotnine import *
from functools import reduce

def plot_singlefamily_lotsizes(save_path = 'Figures/Zoning/sf_lotsizes.svg', width = 10, height = 8):
    """ Plots average lot sizes of single family homes in Austin, Dallas, and Houston conditional on distance from
    the city center. This uses cached municipal parcel data. """

    # Read data
    austin_path = parcel.get_cached_municipal_parcel_path('austin')
    austin_parcels = pd.read_csv(austin_path)[['dist_to_center', 'area_sqft', 'broad_zone', 'place']]
    austin_parcels = austin_parcels.loc[(austin_parcels['place'] == 'Austin') &
                                        (austin_parcels['broad_zone'] == 'Single Family')]

    dallas_path = parcel.get_cached_municipal_parcel_path('dallas')
    dallas_parcels = pd.read_csv(dallas_path)[['dist_to_center', 'area_sqft', 'broad_zone', 'place']]
    dallas_parcels = dallas_parcels.loc[(dallas_parcels['place'] == 'Dallas') &
                                        (dallas_parcels['broad_zone'] == 'Single Family')]

    houston_path = parcel.get_cached_municipal_parcel_path('houston')
    houston_parcels = pd.read_csv(houston_path)[['dist_to_center', 'area_sqft', 'broad_zone', 'place']]
    houston_parcels = houston_parcels.loc[(houston_parcels['place'] == 'Houston') &
                                          (houston_parcels['broad_zone'] == 'Single Family')]

    # Combine, then bucket dist_to_center into one-mile rings, top-coding at 10 miles
    all_parcels = pd.concat([austin_parcels, dallas_parcels, houston_parcels], axis = 0, ignore_index = True)
    all_parcels['dist_to_center'] = all_parcels['dist_to_center'].astype(float).apply(np.ceil)
    all_parcels['dist_to_center'] = all_parcels['dist_to_center'].apply(lambda x: x if x <= 10 else '10+')

    # Calculate final graphed result
    result = all_parcels.groupby(['place', 'dist_to_center', 'broad_zone'])['area_sqft'].mean()
    result = result.unstack().reset_index()

    sflotplot = (ggplot(result, aes(x = 'dist_to_center', y = 'Single Family', fill = 'place'))
                 + geom_col(position = 'dodge', width = 0.7)
                 + labs(x = 'Distance from Center of City (Miles)',
                        y = 'Average Lot Size (Square Feet)',
                        title = 'Average Lot Sizes by Distance from City Center, in Austin, Dallas, and Houston',
                        caption = 'Based on parcel data provided by Austin, Dallas, and Harris County.')
                 + theme_bw())
    sflotplot.save(save_path, width = width, height = height)
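# A quick illustration of the distance bucketing above (a sketch, not part of the pipeline):
# np.ceil rounds each distance up to the next whole mile, and anything past ten miles is
# top-coded as the string '10+', so pd.Series([0.4, 3.2, 12.0]) maps to [1.0, 4.0, '10+'],
# an object-dtype column that ggplot then treats as a discrete axis.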
def plot_percent_undeveloped(save_path = 'Figures/Zoning/percent_undeveloped.svg'):
    """ Calculates the percent of land which is undeveloped (using base_area features and area calculations)
    conditional on distance from the city center and broad_zone. Based on cached municipal parcel data."""

    # Some buildings/parcels span multiple zones, making their percent undeveloped appear to be negative. We
    # substitute these negative values with zero, because most of these buildings are in very dense areas with
    # extremely low setback requirements anyway. This is, of course, a limitation on the accuracy of the results.
    def fill_negatives_with_zeroes(a_series):
        a_series[a_series < 0] = 0
        return a_series

    # Read data and calculate percent undeveloped as well as the number of developed square feet
    austin_path = parcel.get_cached_municipal_parcel_path('austin')
    austin_parcels = pd.read_csv(austin_path)[['dist_to_center', 'area_sqft', 'broad_zone', 'place', 'far']]
    austin_parcels = austin_parcels.loc[austin_parcels['place'] == 'Austin']
    austin_parcels['percent_undeveloped'] = 1 - austin_parcels['far']
    austin_parcels['developed_sqft'] = np.minimum(austin_parcels['area_sqft'],
                                                  austin_parcels['far'].multiply(austin_parcels['area_sqft']))

    dallas_path = parcel.get_cached_municipal_parcel_path('dallas')
    dallas_parcels = pd.read_csv(dallas_path)[['dist_to_center', 'area_sqft', 'broad_zone', 'place', 'TOT_MAIN_SF']]
    dallas_parcels = dallas_parcels.loc[dallas_parcels['place'] == 'Dallas']
    dallas_parcels['percent_undeveloped'] = 1 - dallas_parcels['TOT_MAIN_SF'].divide(dallas_parcels['area_sqft'])
    dallas_parcels['developed_sqft'] = np.minimum(dallas_parcels['TOT_MAIN_SF'], dallas_parcels['area_sqft'])

    houston_path = parcel.get_cached_municipal_parcel_path('houston')
    houston_parcels = pd.read_csv(houston_path)[['dist_to_center', 'area_sqft', 'broad_zone', 'place', 'BASE_AREA']]
    houston_parcels = houston_parcels.loc[houston_parcels['place'] == 'Houston']
    houston_parcels['percent_undeveloped'] = 1 - houston_parcels['BASE_AREA'].divide(houston_parcels['area_sqft'])
    houston_parcels['developed_sqft'] = np.minimum(houston_parcels['BASE_AREA'], houston_parcels['area_sqft'])

    # Combine, zero out the negative percentages described above, and group
    all_data = pd.concat([austin_parcels, dallas_parcels, houston_parcels], axis = 0, ignore_index = True)
    all_data['percent_undeveloped'] = fill_negatives_with_zeroes(all_data['percent_undeveloped'])
    all_data['dist_to_center'] = all_data['dist_to_center'].astype(float).apply(np.ceil)
    grouped_data = all_data.groupby(['place', 'dist_to_center', 'broad_zone'])

    # Calculate medians, simple mean, and area-weighted mean
    medians = grouped_data['percent_undeveloped'].median().reset_index()
    medians['percent_undeveloped'] = 100*medians['percent_undeveloped']
    medians = medians.rename(columns = {'percent_undeveloped': 'percent_undeveloped_median'})

    means = grouped_data['percent_undeveloped'].mean().reset_index()
    means['percent_undeveloped'] = 100*means['percent_undeveloped']
    means = means.rename(columns = {'percent_undeveloped': 'percent_undeveloped_simple_mean'})

    total_areas = grouped_data['area_sqft'].sum()
    total_developed_area = grouped_data['developed_sqft'].sum()  # sum skips NaNs by default
    weighted_means = (1 - total_developed_area.divide(total_areas)).reset_index()
    weighted_means[0] = 100*weighted_means[0]
    weighted_means = weighted_means.rename(columns = {0: 'percent_undeveloped_weighted_average'})

    # Combine and save as csv
    all_results = reduce(lambda left, right: pd.merge(left, right, on = ['broad_zone', 'dist_to_center', 'place'],
                                                      how = 'outer', sort = False),
                         [weighted_means, means, medians])
    all_results.to_csv('shared_data/calculations/percent_undeveloped.csv')

    # Now graph
    all_results = all_results.loc[all_results['broad_zone'] == 'Single Family']
    all_results = all_results.loc[all_results['dist_to_center'] < 11]
    p = (ggplot(all_results, aes(x = 'dist_to_center', y = 'percent_undeveloped_simple_mean',
                                 fill = 'place', group = 'place'))
         + geom_col(position = 'dodge')
         + labs(title = 'Undeveloped Area of Single Family Lots in the Texas Triangle',
                x = 'Distance from City Center (Miles)',
                y = 'Simple Mean of Undeveloped Percentage of SF Lots'))
    p.save(save_path, width = 10, height = 8)
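# To see why the weighted average can differ sharply from the simple mean above, consider a
# toy example: one 1,000 sqft lot that is 50% developed and one 9,000 sqft lot that is 10%
# developed. The simple mean of percent undeveloped is (50 + 90)/2 = 70%, but weighting by
# lot area gives 1 - (500 + 900)/10,000 = 86% undeveloped.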
def calc_parking_costs(save_path = 'Figures/property_value_histogram.svg'):
    """Calculates average land costs within 1 mile of the city center in Austin, Dallas, and Houston. Relies on
    cached municipal parcel data. These figures are a bit conservative because they use lot size instead of the
    base area of the actual building."""

    # Read data, get value
    austin_path = parcel.get_cached_municipal_parcel_path('austin')
    austin_parcels = pd.read_csv(austin_path)[['dist_to_center', 'area_sqft', 'place', 'appraised']]
    austin_parcels['value'] = austin_parcels['appraised'].astype(float)
    austin_parcels = austin_parcels.loc[(austin_parcels['place'] == 'Austin') &
                                        (austin_parcels['dist_to_center'] <= 1)]

    dallas_path = parcel.get_cached_municipal_parcel_path('dallas')
    dallas_parcels = pd.read_csv(dallas_path)[['dist_to_center', 'area_sqft', 'place', 'TOT_VAL']]
    dallas_parcels['value'] = dallas_parcels['TOT_VAL'].astype(float)
    dallas_parcels = dallas_parcels.loc[(dallas_parcels['place'] == 'Dallas') &
                                        (dallas_parcels['dist_to_center'] <= 1)]

    houston_path = parcel.get_cached_municipal_parcel_path('houston')
    houston_parcels = pd.read_csv(houston_path)[['dist_to_center', 'area_sqft', 'place', 'TOTAL_APPRAISED_VALUE']]
    houston_parcels['value'] = houston_parcels['TOTAL_APPRAISED_VALUE'].astype(float)
    houston_parcels = houston_parcels.loc[(houston_parcels['place'] == 'Houston') &
                                          (houston_parcels['dist_to_center'] <= 1)]

    # Join, calculate value_per_sqft, group
    all_data = pd.concat([austin_parcels, dallas_parcels, houston_parcels], axis = 0, ignore_index = True)
    all_data['value_per_sqft'] = all_data['value'].divide(all_data['area_sqft'])
    grouped_data = all_data.groupby(['place'])

    # Calculate median, simple mean, and area-weighted mean
    median = grouped_data['value_per_sqft'].median().reset_index()
    median = median.rename(columns = {'value_per_sqft': 'value_per_sqft_median'})

    simple_mean = grouped_data['value_per_sqft'].mean().reset_index()
    simple_mean = simple_mean.rename(columns = {'value_per_sqft': 'value_per_sqft_simple_mean'})

    weighted_mean = grouped_data['value'].sum().divide(grouped_data['area_sqft'].sum()).reset_index()
    weighted_mean = weighted_mean.rename(columns = {0: 'value_per_sqft_weighted_mean'})

    all_results = reduce(lambda left, right: pd.merge(left, right, on = ['place'], how = 'outer', sort = False),
                         [weighted_mean, simple_mean, median])
    all_results.to_csv('shared_data/calculations/values_per_sqft.csv')

    # Graph a histogram, capping values at the 95th percentile to get rid of the outliers
    maximum = all_data['value_per_sqft'].quantile(.95)
    all_data.loc[all_data['value_per_sqft'] > maximum, 'value_per_sqft'] = maximum
    all_data = all_data.loc[(all_data['place'].notnull()) & (all_data['value_per_sqft'].notnull())]
    p = (ggplot(all_data, aes(x = 'value_per_sqft', fill = 'place', group = 'place'))
         + geom_histogram()
         + labs(title = 'Property Values within 1 Mile of City Center in the Texas Triangle',
                x = 'Value per Square Foot',
                y = 'Number of Parcels')
         + theme_bw()
         + facet_wrap('~place'))
    p.save(save_path, width = 10, height = 8)
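# A minimal sketch of how these functions might be run (it assumes the municipal parcel
# caches from TXHousing.data_processing.parcel have already been built, and that the
# 'Figures/Zoning' and 'shared_data/calculations' output directories exist).
if __name__ == '__main__':
    plot_singlefamily_lotsizes()
    plot_percent_undeveloped()
    calc_parking_costs()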