!pip install geopandas

import pandas as pd
import numpy as np
import re

import geopandas
import plotnine
from plotnine import *

import matplotlib.pyplot as plt
import matplotlib.tri as tri
from mpl_toolkits.axes_grid1 import make_axes_locatable

from sklearn.metrics import confusion_matrix

%matplotlib inline

Covid19 Weekly Data from the 'Comunidad de Madrid'

https://datos.comunidad.madrid/catalogo/dataset/covid19_tia_zonas_basicas_salud

Covid 19-TIA Zonas Básicas de Salud (CSV)

On a Tuesday morning, weekly data are provided for the 286 basic health areas:

confirmed cases and rate of accumulated incidence for basic health areas
the source of data is the Madrid region's network of epidemiological vigilence
the data are continually updated; you should always show the date when the data were accessed
the polygon data includes the population figures from the region's statistical institute from the population register of 1 January 2019.

"Información epidemiológica Covid-19: Casos confirmados y tasa de incidencia acumulada (TIA) por Zonas Básicas de Salud (Decreto 52/2010, de 29 de julio, BOCM 9 de agosto de 2010). Informe diario o semanal con datos de casos confirmados, casos confirmados con infección activa y tasas de incidencia acumulada de los últimos 14 días y desde el inicio de la epidemia (25 de febrero de 2020). La fuente de los datos es la Red de Vigilancia Epidemiológica de la Comunidad de Madrid. Cuando se utilicen los datos se debe indicar en qué fecha se ha accedido a los mismos, dada su actualización continua. Hasta el 1 de julio de 2020 los registros de datos correspondientes a cada fecha de informe se han añadido diariamente. A partir del 2 de julio de 2020 la actualización pasa a ser semanal. A fecha 1 de junio, se ha procedido a actualizar el conjunto de datos de Zonas Básicas de Salud, con los datos actualizados de geometrías proporcionados por el Instituto de Estadística de la Comunidad de Madrid. Las nuevas geometrías incluyen los datos poblacionales actualizados por el Instituto de Estadística de la Comunidad de Madrid, relativos al Padrón Continuo a 1 de enero de 2019."

Here we will read the CSV data dated 29 September 2020 and explore:

zones_df=pd.read_csv('covid19_tia_zonas_basicas_salud_s.csv', delimiter=';', encoding='latin')

print ('Columns:')
[print(col) for col in list(zones_df.columns)]
print()
print('Shape: ',zones_df.shape)
print()
print('Dates: ',list(zones_df.fecha_informe.unique()))

Columns:
zona_basica_salud
fecha_informe
casos_confirmados_activos_ultimos_14dias
tasa_incidencia_acumulada_activos_ultimos_14dias
casos_confirmados_ultimos_14dias
tasa_incidencia_acumulada_ultimos_14dias
casos_confirmados_totales
tasa_incidencia_acumulada_total
codigo_geometria

Shape:  (5434, 9)

Dates:  ['2020/09/29 11:15:00', '2020/09/22 09:42:00', '2020/09/15 10:23:00', '2020/09/08 11:03:00', '2020/09/01 09:33:00', '2020/08/25 09:33:00', '2020/08/18 09:32:00', '2020/08/11 09:31:00', '2020/08/04 09:58:00', '2020/07/28 09:34:00', '2020/07/21 09:23:00', '2020/07/14 09:23:00', '2020/07/07 09:32:00', '2020/06/30 07:00:00', '2020/06/23 07:00:00', '2020/06/16 07:00:00', '2020/06/09 07:00:00', '2020/06/02 07:00:00', '2020/05/26 07:00:00']

Tidy-up the dataframe by

replacing decimal , with decimal .
shortening key column names
dropping time from the date column

for col in ['tasa_incidencia_acumulada_activos_ultimos_14dias', 'tasa_incidencia_acumulada_ultimos_14dias', 'tasa_incidencia_acumulada_total']:
    zones_df[col] = pd.to_numeric(zones_df[col].apply(lambda x: re.sub(',', '.', x)))
zones_df.rename(columns={'zona_basica_salud':'zona_basic','tasa_incidencia_acumulada_ultimos_14dias':'tasa', 'casos_confirmados_ultimos_14dias':'casos'}, inplace=True)
zones_df.fecha_informe=zones_df.fecha_informe.apply(lambda x: x[5:10])

Data for the centre of Madrid (where the main office of the regional authority are located).

zones_df[zones_df.zona_basic=='Cortes'].T

fig, ax = plt.subplots(1,1,figsize=(10,5))
plt.plot(zones_df[zones_df.zona_basic=='Cortes'].fecha_informe[::-1], zones_df[zones_df.zona_basic=='Cortes'].casos[::-1])
plt.title('Cases confirmed in the last 14 days - Cortes');

fig, ax = plt.subplots(1,1,figsize=(10,5))
plt.plot(zones_df[zones_df.zona_basic=='Cortes'].fecha_informe[::-1], zones_df[zones_df.zona_basic=='Cortes'].tasa[::-1])
plt.ylim(0,1000)
plt.title('Rate of accumulated incidence for 100 000 people in the last 14 days - Cortes');

print(len(zones_df.zona_basic.unique()))
print(zones_df.zona_basic.unique()[:10])

286
['Abrantes' 'Acacias' 'Adelfas' 'Alameda' 'Alameda de Osuna'
 'Alcalde Bartolomé González' 'Alcalá de Guadaira' 'Alcobendas - Chopera'
 'Alcocer' 'Algete']

Allocate to bins based on rate

zones_df.tasa.fillna(0, inplace=True)
cut_labels=['<200','200-400','400-600','600-800','800-1000','>1000']
cut_bins = [-1., 200., 400., 600., 800., 1000., max(zones_df.tasa)]
zones_df['tasa_bin'] = pd.cut(zones_df.tasa, bins=cut_bins, labels=cut_labels)
zones_df.tasa_bin.value_counts()

<200        3667
200-400      654
400-600      475
600-800      301
>1000        175
800-1000     162
Name: tasa_bin, dtype: int64

Helper function to check for errors in the presentation of the names, e.g. spaces, 'de',...

def check_zones(zonas_restricted):
  lst=[]
  for zona_basic in zones_df.zona_basic[:286]:
    if zona_basic in zonas_restricted: lst.append(zona_basic)
  print(len(lst), set(zonas_restricted)-set(lst))

Identify the initial 37 areas restricted from 21 September 2020:

zonas_restricted_1=["Puerta Bonita","Vista Alegre","Guayaba", "Almendrales", "Las Calesas", "Zofío", "Orcasur", "San Fermín", "San Andrés", "San Cristóbal", 
                  "El Espinillo", "Los Rosales", "Villa Vallecas", "Entrevías", "Martínez de la Riva", "San Diego",
                  "Numancia", "Peña Prieta", "Pozo de Tío Raimundo", "Ángela Uriarte", "Alcalá de Guadaira", "Federica Montseny", "Doctor Cirajas", 
                  "Gandhi", "Daroca", "La Elipa", "Alicante", "Cuzco", "Francia", "Humanes de Madrid", "San Blas", "Isabel II",
                  "Las Margaritas", "Sánchez Morate", "Reyes Católicos", "Alcobendas - Chopera", "Miraflores"]

check_zones(zonas_restricted_1)

37 set()

Identify the additional 8 areas restricted from 28 September 2020:

zonas_restricted_2=["Panaderas", "Doctor Trueta", "Miguel Servet", "Campo de la Paloma", "Rafael Alberti", "García Noblejas", "Vicálvaro - Artilleros","Orcasitas"]

check_zones(zonas_restricted_2)

8 set()

Label zones with restricted group

zones_df['restricted']='0'
for i in range(len(zonas_restricted_1)): zones_df.loc[zones_df.zona_basic==zonas_restricted_1[i], 'restricted']='1'
for i in range(len(zonas_restricted_2)): zones_df.loc[zones_df.zona_basic==zonas_restricted_2[i], 'restricted']='2'
zones_df.restricted.value_counts()/len(zones_df.fecha_informe.unique())

0    241.0
1     37.0
2      8.0
Name: restricted, dtype: float64

Helper function to reorder the presentation of the dates

def col_func(fecha): return fecha[3]+fecha[4]+fecha[2]+fecha[0]+fecha[1]

Histograms of number of zones in each category in the last 6 weeks

plotnine.options.figure_size = (14, 8)

ggplot(zones_df[zones_df.fecha_informe>'08/24'], aes(x='tasa_bin', fill='restricted')) \
+ geom_histogram(binwidth=1, alpha=0.6, position='stack') \
+ facet_wrap('fecha_informe', labeller=labeller(cols=col_func)) \
+ theme_minimal() \
+ labs(title="Zonas básicas de salud",
       x='Tasa incidencia acumulada ultimos 14 días',
       y="Número de zonas") \
+ scale_fill_brewer(palette = 'Reds')

/usr/local/lib/python3.6/dist-packages/plotnine/utils.py:1246: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if pdtypes.is_categorical(arr):

<ggplot: (8761571081273)>

Maps for the most recent week

week_df=zones_df[zones_df.fecha_informe==zones_df.fecha_informe.unique().max()][['zona_basic','tasa','casos','tasa_bin','restricted']]
df=(geopandas.read_file('./maps/zonas_basicas_salud.shp')).merge(week_df)
df['geometry_pt']=df.geometry.centroid
df.head().T

Maps of rate of accumulated instance and restricted zones

fig, (ax1, ax2) = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(12,6))

df.plot(ax=ax1, column=df.tasa_bin, cmap='Reds', legend=True)
df.plot(ax=ax1, color='white', edgecolor='grey', alpha=0.1)
ax1.set_title('Rate of accumulated incidence')
ax1.axis('off')

df.plot(ax=ax2, column=df.restricted, cmap='Reds', legend=True)
df.plot(ax=ax2, color='white', edgecolor='grey', alpha=0.1)
ax2.set_title('Restricted zones')
ax2.axis('off');

df.tasa_bin.value_counts()

600-800     89
400-600     78
>1000       49
800-1000    49
200-400     21
<200         0
Name: tasa_bin, dtype: int64

df.restricted.value_counts()

0    241
1     37
2      8
Name: restricted, dtype: int64

Treating the rate as a surface and plotting contours

Linear interpolation of the rate at the centroid of each zone

def getXY(pt): return (pt.x, pt.y)

def data_for_week(fecha):
  week_df=zones_df[zones_df.fecha_informe==fecha][['zona_basic','tasa','casos','tasa_bin','restricted']]
  df=geopandas.read_file('./maps/zonas_basicas_salud.shp').merge(week_df)
  x, y = [list(t) for t in zip(*map(getXY, df.geometry.centroid))]
  z = df.tasa
  return x,y,z

def multi_plot(i):

    fecha=weeks[i]
    x,y,z = data_for_week(fecha)

    fig, ax = plt.subplots(1,1,figsize=(10,8))
    ax.set(xlim=(360000, 500000), ylim=(4420000, 4550000))
    
    cntr = ax.tricontourf(x, y, z, [0,200,400,800,1000,2000], cmap="Reds") # plot contour fill
    df.plot(color='white', edgecolor='grey', alpha=0.1, ax=ax)

    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.0)

    fig.colorbar(cntr, drawedges=False, cax=cax)
    ax.set_title(f'Tasa incidencia acumulada en los últimos 14 días\n fecha {fecha[3]+fecha[4]+fecha[2]+fecha[0]+fecha[1]}, maximum {str(int(max(z)))}/100000 habitantes')
    ax.axis('off')

weeks = zones_df.fecha_informe.unique()[:8][::-1] 
for i in range(len(weeks)): multi_plot(i)

Plots for 4 weeks

def plot_4_weeks(plot_weeks):
  fig, axs = plt.subplots(1, 4, figsize=(16, 4))
  for i in range(4):
    fecha=plot_weeks[i]
    x,y,z = data_for_week(plot_weeks[i])
    df.plot(color='white', edgecolor='grey', alpha=0.1, ax=axs[i])
    axs[i].set_title(f'\nfecha {fecha[3]+fecha[4]+fecha[2]+fecha[0]+fecha[1]}\nmax. {str(int(max(z)))}/100000 hab.', fontsize=8)
    axs[i].axis('off')
    cntr=axs[i].tricontourf(x,y,z,[0,200,400,800,1000,2000], cmap="Reds")

  divider = make_axes_locatable(axs[-1])
  cax = divider.append_axes("right", size="3%", pad=0.0)
  fig.colorbar(cntr, drawedges=False, cax=cax)
  fig.suptitle('Tasa incidencia acumulada en los últimos 14 días', fontsize=12, va='baseline')
  plt.savefig('four_weeks.png', bbox_inches='tight');

plot_4_weeks(weeks[:4])

plot_4_weeks(weeks[4:])

Confusion matrix for rate > 1000 and restrictions

week_df=zones_df[zones_df.fecha_informe==zones_df.fecha_informe.max()][['zona_basic','tasa','casos','tasa_bin','restricted']]
df=(geopandas.read_file('./maps/zonas_basicas_salud.shp')).merge(week_df)
print(confusion_matrix((df.restricted != '0')*1, (df.tasa>1000)*1))

[[228  13]
 [  9  36]]

228 zones with rate less than 1000 and no restrictions.
36 zones with rate over 1000 and restrictions.

13 zones with rate over 1000 but no restrictions

df[(df.restricted == '0') & (df.tasa>1000)][['zona_basic','tasa']].sort_values('tasa', ascending=False)

9 zones with rate under 1000 but restricted

df[(df.restricted != '0') & (df.tasa<1000)][['zona_basic','tasa']].sort_values('tasa')

Weak relationship between population density and rate

df['area_sq_km'] = df.geometry.area/1_000_000
df['pob_densidad_sq_km'] = df.pob_pad19/df.area_sq_km
df[['tasa','pob_densidad_sq_km']].corr()

ggplot(df, aes(x='pob_densidad_sq_km',y='tasa',color='restricted')) \
  + geom_point() \
  + geom_smooth(aes(x='pob_densidad_sq_km',y='tasa'), color='black',method = "lm")

/usr/local/lib/python3.6/dist-packages/plotnine/utils.py:1246: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if pdtypes.is_categorical(arr):

<ggplot: (8761570866352)>

Comparison of individual zones to the best and worst

df_week=zones_df[zones_df.fecha_informe==zones_df.fecha_informe.max()]
most = df_week[df_week.tasa==df_week.tasa.max()].zona_basic.to_list()[0]
least = df_week[df_week.tasa==df_week.tasa.min()].zona_basic.to_list()[0]
most, least

('Alicante', 'El Pardo')

zona = 'Cortes'

df_tmp = zones_df.pivot_table(values='tasa', index='fecha_informe', columns='zona_basic').reset_index()
zbs = [most, zona, least]
title= zbs[1] + ' compared to least (' + zbs[2] + ') and most (' + zbs[0] + ')'
ggplot(df_tmp, aes(x='fecha_informe',y=zbs[1],group=1)) \
  + geom_point(colour='blue', alpha = 0.5) \
  + geom_line(aes(y=zbs[1]), colour='blue') \
  + geom_smooth(aes(y=zbs[0]), colour='red' ) \
  + geom_smooth(aes(y=zbs[2]), colour='green') \
  + labs(title=title,
         x='Fecha informe',
         y="Tasa") \
  + geom_text(data=df_tmp[df_tmp.fecha_informe=='09/22'],label=zbs[1], color='blue', hjust= 'right', vjust='bottom') \
  + theme_minimal()

/usr/local/lib/python3.6/dist-packages/plotnine/utils.py:1246: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if pdtypes.is_categorical(arr):
/usr/local/lib/python3.6/dist-packages/plotnine/stats/smoothers.py:168: PlotnineWarning: Confidence intervals are not yet implementedfor lowess smoothings.
  "for lowess smoothings.", PlotnineWarning)
/usr/local/lib/python3.6/dist-packages/plotnine/stats/smoothers.py:168: PlotnineWarning: Confidence intervals are not yet implementedfor lowess smoothings.
  "for lowess smoothings.", PlotnineWarning)

<ggplot: (8761578846640)>

	60	346	632	918	1204	1490	1776	2062	2348	2651	2920	3206	3492	3778	4064	4350	4636	4922	5208
zona_basic	Cortes	Cortes	Cortes	Cortes	Cortes	Cortes	Cortes	Cortes	Cortes	Cortes	Cortes	Cortes	Cortes	Cortes	Cortes	Cortes	Cortes	Cortes	Cortes
fecha_informe	09/29	09/22	09/15	09/08	09/01	08/25	08/18	08/11	08/04	07/28	07/21	07/14	07/07	06/30	06/23	06/16	06/09	06/02	05/26
casos_confirmados_activos_ultimos_14dias	49	67	18	17	14	17	33	15	19	10	NaN	NaN	NaN	NaN	NaN	7	NaN	NaN	NaN
tasa_incidencia_acumulada_activos_ultimos_14dias	185.94	254.25	68.31	64.51	53.13	64.51	125.23	56.92	72.1	37.95	18.97	15.18	3.79	3.79	18.97	26.56	15.18	3.79	3.79
casos	203	188	188	169	147	89	64	49	27	19	7	NaN	NaN	NaN	8	10	10	7	NaN
tasa	770.34	713.42	713.42	641.32	557.83	337.74	242.87	185.94	102.46	72.1	26.56	18.97	7.59	11.38	30.36	37.95	37.95	26.56	15.18
casos_confirmados_totales	939	820	713	613	522	429	365	332	293	281	264	261	251	246	245	242	236	231	226
tasa_incidencia_acumulada_total	3563.3	3111.72	2705.68	2326.2	1980.87	1627.96	1385.09	1259.87	1111.87	1066.33	1001.82	990.44	952.49	933.52	929.72	918.34	895.57	876.59	857.62
codigo_geometria	061	061	061	061	061	061	061	061	061		061	061	061	061	061	061	061	061	061

	0	1	2	3	4
codigo_geo	001	002	003	004	005
pob_pad19	30748	19432	29168	21274	29001
zona_basic	Abrantes	Acacias	Adelfas	Alameda	Alameda de Osuna
geometry	POLYGON ((439068.7580000004 4470731.660800001,...	POLYGON ((439924.9298999999 4472798.281400001,...	POLYGON ((443455.7175000003 4472836.7236, 4434...	POLYGON ((440659.7537000002 4473778.4014, 4406...	POLYGON ((452408.1383999996 4484644.699999999,...
tasa	913.88	504.32	551.97	766.19	610.32
casos	281	98	161	163	177
tasa_bin	800-1000	400-600	400-600	600-800	600-800
restricted	0	0	0	0	0
geometry_pt	POINT (438360.9828817752 4470073.107306919)	POINT (439858.3553289876 4472447.600519221)	POINT (442903.0377880885 4472532.970746173)	POINT (440836.3215957115 4473539.390589811)	POINT (451700.7212964218 4480650.657189403)

	zona_basic	tasa
274	Villa del Prado	1335.76
206	Pintores	1283.70
59	Comillas	1128.86
250	Sierra de Guadarrama	1123.82
78	El Naranjo	1118.60
237	San Isidro	1114.40
255	Torrelaguna	1107.82
51	Ciudad San Pablo	1101.67
268	Valleaguado	1081.95
134	Las Fronteras	1081.33
40	Canillejas	1071.17
16	Antonio Leyva	1023.47
162	María Curie	1016.51

	zona_basic	tasa
235	San Fermín	738.58
188	Orcasitas	788.33
74	El Espinillo	835.19
189	Orcasur	883.03
98	García Noblejas	889.98
108	Guayaba	898.50
90	Federica Montseny	900.03
254	Sánchez Morate	934.83
64	Daroca	950.19

	tasa	pob_densidad_sq_km
tasa	1.000000	0.380547
pob_densidad_sq_km	0.380547	1.000000