Source code for TXHousing.analysis.parcel_graphs

"""Graphs which mostly rely on municipal parcel data. Note that all of these graphs rely on the cached parcel data in
 csv format, not the actual parcel data, which is processed and cached in the data_processing package."""

import numpy as np
import pandas as pd
from ..data_processing import parcel
from plotnine import *
from functools import reduce

def plot_singlefamily_lotsizes(save_path = 'Figures/Zoning/sf_lotsizes.svg', width = 10, height = 8):
    """ Plots average lot sizes of single family homes in Austin, Dallas, and Houston conditional on distance from
    the city center. This uses cached municipal parcel data. """

    # Read data
    austin_path = parcel.get_cached_municipal_parcel_path('austin')
    austin_parcels = pd.read_csv(austin_path)[['dist_to_center', 'area_sqft', 'broad_zone', 'place']]
    austin_parcels = austin_parcels.loc[(austin_parcels['place'] == 'Austin') &
                                        (austin_parcels['broad_zone'] == 'Single Family')]

    dallas_path = parcel.get_cached_municipal_parcel_path('dallas')
    dallas_parcels = pd.read_csv(dallas_path)[['dist_to_center', 'area_sqft', 'broad_zone', 'place']]
    dallas_parcels = dallas_parcels.loc[(dallas_parcels['place'] == 'Dallas') &
                                        (dallas_parcels['broad_zone'] == 'Single Family')]

    houston_path = parcel.get_cached_municipal_parcel_path('houston')
    houston_parcels = pd.read_csv(houston_path)[['dist_to_center', 'area_sqft', 'broad_zone', 'place']]
    houston_parcels = houston_parcels.loc[(houston_parcels['place'] == 'Houston') &
                                          (houston_parcels['broad_zone'] == 'Single Family')]

    # Combine, then bucket dist_to_center into one-mile rings, top-coding at 10 miles
    all_parcels = pd.concat([austin_parcels, dallas_parcels, houston_parcels], axis = 0, ignore_index = True)
    all_parcels['dist_to_center'] = all_parcels['dist_to_center'].astype(float).apply(np.ceil)
    all_parcels['dist_to_center'] = all_parcels['dist_to_center'].apply(lambda x: x if x <= 10 else '10+')

    # Calculate final graphed result
    result = all_parcels.groupby(['place', 'dist_to_center', 'broad_zone'])['area_sqft'].mean()
    result = result.unstack().reset_index()

    sflotplot = (ggplot(result, aes(x = 'dist_to_center', y = 'Single Family', fill = 'place'))
                 + geom_col(position = 'dodge', width = 0.7)
                 + labs(x = 'Distance from Center of City (Miles)',
                        y = 'Average Lot Size (Square Feet)',
                        title = 'Average Lot Sizes by Distance from City Center, in Austin, Dallas, and Houston',
                        caption = 'Based on parcel data provided by Austin, Dallas, and Harris County.')
                 + theme_bw())
    sflotplot.save(save_path, width = width, height = height)
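# A quick illustration of the distance bucketing above (a sketch, not part of the pipeline):
# np.ceil rounds each distance up to the next whole mile, and anything past ten miles is
# top-coded as the string '10+', so pd.Series([0.4, 3.2, 12.0]) maps to [1.0, 4.0, '10+'],
# an object-dtype column that ggplot then treats as a discrete axis.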
def plot_percent_undeveloped(save_path = 'Figures/Zoning/percent_undeveloped.svg'):
    """ Calculates the percent of land which is undeveloped (using base_area features and area calculations)
    conditional on distance from the city center and broad_zone. Based on cached municipal parcel data."""

    # Some buildings/parcels span multiple zones, making their percent undeveloped appear to be negative. We
    # substitute these negative values with zero, because most of these buildings are in very dense areas with
    # extremely low setback requirements anyway. This is, of course, a limitation on the accuracy of the results.
    def fill_negatives_with_zeroes(a_series):
        a_series[a_series < 0] = 0
        return a_series

    # Read data and calculate percent undeveloped as well as the number of developed square feet
    austin_path = parcel.get_cached_municipal_parcel_path('austin')
    austin_parcels = pd.read_csv(austin_path)[['dist_to_center', 'area_sqft', 'broad_zone', 'place', 'far']]
    austin_parcels = austin_parcels.loc[austin_parcels['place'] == 'Austin']
    austin_parcels['percent_undeveloped'] = 1 - austin_parcels['far']
    austin_parcels['developed_sqft'] = np.minimum(austin_parcels['area_sqft'],
                                                  austin_parcels['far'].multiply(austin_parcels['area_sqft']))

    dallas_path = parcel.get_cached_municipal_parcel_path('dallas')
    dallas_parcels = pd.read_csv(dallas_path)[['dist_to_center', 'area_sqft', 'broad_zone', 'place', 'TOT_MAIN_SF']]
    dallas_parcels = dallas_parcels.loc[dallas_parcels['place'] == 'Dallas']
    dallas_parcels['percent_undeveloped'] = 1 - dallas_parcels['TOT_MAIN_SF'].divide(dallas_parcels['area_sqft'])
    dallas_parcels['developed_sqft'] = np.minimum(dallas_parcels['TOT_MAIN_SF'], dallas_parcels['area_sqft'])

    houston_path = parcel.get_cached_municipal_parcel_path('houston')
    houston_parcels = pd.read_csv(houston_path)[['dist_to_center', 'area_sqft', 'broad_zone', 'place', 'BASE_AREA']]
    houston_parcels = houston_parcels.loc[houston_parcels['place'] == 'Houston']
    houston_parcels['percent_undeveloped'] = 1 - houston_parcels['BASE_AREA'].divide(houston_parcels['area_sqft'])
    houston_parcels['developed_sqft'] = np.minimum(houston_parcels['BASE_AREA'], houston_parcels['area_sqft'])

    # Combine, zero out the negative percentages described above, and group
    all_data = pd.concat([austin_parcels, dallas_parcels, houston_parcels], axis = 0, ignore_index = True)
    all_data['percent_undeveloped'] = fill_negatives_with_zeroes(all_data['percent_undeveloped'])
    all_data['dist_to_center'] = all_data['dist_to_center'].astype(float).apply(np.ceil)
    grouped_data = all_data.groupby(['place', 'dist_to_center', 'broad_zone'])

    # Calculate medians, simple mean, and area-weighted mean
    medians = grouped_data['percent_undeveloped'].median().reset_index()
    medians['percent_undeveloped'] = 100*medians['percent_undeveloped']
    medians = medians.rename(columns = {'percent_undeveloped': 'percent_undeveloped_median'})

    means = grouped_data['percent_undeveloped'].mean().reset_index()
    means['percent_undeveloped'] = 100*means['percent_undeveloped']
    means = means.rename(columns = {'percent_undeveloped': 'percent_undeveloped_simple_mean'})

    total_areas = grouped_data['area_sqft'].sum()
    total_developed_area = grouped_data['developed_sqft'].sum()  # sum skips NaNs by default
    weighted_means = (1 - total_developed_area.divide(total_areas)).reset_index()
    weighted_means[0] = 100*weighted_means[0]
    weighted_means = weighted_means.rename(columns = {0: 'percent_undeveloped_weighted_average'})

    # Combine and save as csv
    all_results = reduce(lambda left, right: pd.merge(left, right, on = ['broad_zone', 'dist_to_center', 'place'],
                                                      how = 'outer', sort = False),
                         [weighted_means, means, medians])
    all_results.to_csv('shared_data/calculations/percent_undeveloped.csv')

    # Now graph
    all_results = all_results.loc[all_results['broad_zone'] == 'Single Family']
    all_results = all_results.loc[all_results['dist_to_center'] < 11]
    p = (ggplot(all_results, aes(x = 'dist_to_center', y = 'percent_undeveloped_simple_mean',
                                 fill = 'place', group = 'place'))
         + geom_col(position = 'dodge')
         + labs(title = 'Undeveloped Area of Single Family Lots in the Texas Triangle',
                x = 'Distance from City Center (Miles)',
                y = 'Simple Mean of Undeveloped Percentage of SF Lots'))
    p.save(save_path, width = 10, height = 8)
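# To see why the weighted average can differ sharply from the simple mean above, consider a
# toy example: one 1,000 sqft lot that is 50% developed and one 9,000 sqft lot that is 10%
# developed. The simple mean of percent undeveloped is (50 + 90)/2 = 70%, but weighting by
# lot area gives 1 - (500 + 900)/10,000 = 86% undeveloped.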
def calc_parking_costs(save_path = 'Figures/property_value_histogram.svg'):
    """Calculates average land costs within 1 mile of the city center in Austin, Dallas, and Houston. Relies on
    cached municipal parcel data. These figures are a bit conservative because they use lot size instead of the
    base area of the actual building."""

    # Read data, get value
    austin_path = parcel.get_cached_municipal_parcel_path('austin')
    austin_parcels = pd.read_csv(austin_path)[['dist_to_center', 'area_sqft', 'place', 'appraised']]
    austin_parcels['value'] = austin_parcels['appraised'].astype(float)
    austin_parcels = austin_parcels.loc[(austin_parcels['place'] == 'Austin') &
                                        (austin_parcels['dist_to_center'] <= 1)]

    dallas_path = parcel.get_cached_municipal_parcel_path('dallas')
    dallas_parcels = pd.read_csv(dallas_path)[['dist_to_center', 'area_sqft', 'place', 'TOT_VAL']]
    dallas_parcels['value'] = dallas_parcels['TOT_VAL'].astype(float)
    dallas_parcels = dallas_parcels.loc[(dallas_parcels['place'] == 'Dallas') &
                                        (dallas_parcels['dist_to_center'] <= 1)]

    houston_path = parcel.get_cached_municipal_parcel_path('houston')
    houston_parcels = pd.read_csv(houston_path)[['dist_to_center', 'area_sqft', 'place', 'TOTAL_APPRAISED_VALUE']]
    houston_parcels['value'] = houston_parcels['TOTAL_APPRAISED_VALUE'].astype(float)
    houston_parcels = houston_parcels.loc[(houston_parcels['place'] == 'Houston') &
                                          (houston_parcels['dist_to_center'] <= 1)]

    # Join, calculate value_per_sqft, group
    all_data = pd.concat([austin_parcels, dallas_parcels, houston_parcels], axis = 0, ignore_index = True)
    all_data['value_per_sqft'] = all_data['value'].divide(all_data['area_sqft'])
    grouped_data = all_data.groupby(['place'])

    # Calculate median, simple mean, and area-weighted mean
    median = grouped_data['value_per_sqft'].median().reset_index()
    median = median.rename(columns = {'value_per_sqft': 'value_per_sqft_median'})

    simple_mean = grouped_data['value_per_sqft'].mean().reset_index()
    simple_mean = simple_mean.rename(columns = {'value_per_sqft': 'value_per_sqft_simple_mean'})

    weighted_mean = grouped_data['value'].sum().divide(grouped_data['area_sqft'].sum()).reset_index()
    weighted_mean = weighted_mean.rename(columns = {0: 'value_per_sqft_weighted_mean'})

    all_results = reduce(lambda left, right: pd.merge(left, right, on = ['place'], how = 'outer', sort = False),
                         [weighted_mean, simple_mean, median])
    all_results.to_csv('shared_data/calculations/values_per_sqft.csv')

    # Graph a histogram, capping values at the 95th percentile to get rid of the outliers
    maximum = all_data['value_per_sqft'].quantile(.95)
    all_data.loc[all_data['value_per_sqft'] > maximum, 'value_per_sqft'] = maximum
    all_data = all_data.loc[(all_data['place'].notnull()) & (all_data['value_per_sqft'].notnull())]
    p = (ggplot(all_data, aes(x = 'value_per_sqft', fill = 'place', group = 'place'))
         + geom_histogram()
         + labs(title = 'Property Values within 1 Mile of City Center in the Texas Triangle',
                x = 'Value per Square Foot',
                y = 'Number of Parcels')
         + theme_bw()
         + facet_wrap('~place'))
    p.save(save_path, width = 10, height = 8)
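# A minimal sketch of how these functions might be run (it assumes the municipal parcel
# caches from TXHousing.data_processing.parcel have already been built, and that the
# 'Figures/Zoning' and 'shared_data/calculations' output directories exist).
if __name__ == '__main__':
    plot_singlefamily_lotsizes()
    plot_percent_undeveloped()
    calc_parking_costs()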