# Break the raw text into one row per numbered entry, normalise whitespace,
# and then split again in front of the
# "Zustellungsbevollmächtigte(r) Vertreter(in)" heading so that this part
# ends up in a row of its own.
df_clean <- df_raw %>%
  ungroup() %>%
  mutate(
    # split point: CRLF (plus optional whitespace) directly before "<digits>."
    text_split = str_split(text_raw, regex("\r\n\\s*(?=\\d+\\.)"))
  ) %>%
  unnest_longer(text_split) %>%
  mutate(
    text_split = str_trim(str_squish(text_split)),
    # the leading "." consumes the single character in front of the heading
    text_split = str_split(
      text_split,
      ".(?=Zustellungsbevollmächtigte(r)? Vertreter(in)?)"
    )
  ) %>%
  unnest_longer(text_split)
# get Listenplatz ------------------------------------------------------------
# List position: the number at the very start of an entry ("12. ..." -> 12).
# Entries whose number is followed by "Bezirk" are rejected by the negative
# lookahead (presumably district headings rather than candidates) and get NA.
df_clean <- df_clean %>%
  mutate(
    listenplatz = as.numeric(
      str_extract(
        str_extract(text_split, regex("^\\d+\\.?\\s+(?!Bezirk)")),
        "\\d*"
      )
    )
  )
# get elections -----------------------------------------------------------
# Election type: the first word ending in "wahl"/"wahlen" that follows an
# upper-case letter and a dot (e.g. "A. Gemeinderatswahl") in the raw text.
# Rows without such a match inherit the value from the closest row above.
df_clean <- df_clean %>%
  mutate(
    election = text_raw %>%
      # [A-Za-z] instead of [A-z]: the latter also matches the punctuation
      # characters that sit between "Z" and "a" ([ \ ] ^ _ `).
      str_extract(
        regex("(?<=[A-Z]\\.)\\s*[A-Za-z]+wahl(en)?", dotall = TRUE)
      ) %>%
      str_trim(side = "both")
  ) %>%
  tidyr::fill(election, .direction = "down")
# electoral district --------------------------------------------------------------
# Electoral district, depending on the election type:
#   * Bezirksvertretungswahlen: "<n>. Bezirk" taken from the entry text
#   * Gemeinderatswahl: text from "Wahlkreis" up to the first run of
#     upper-case letters
#   * Stadtwahl: the constant "Stadtwahl"
#   * anything else (including a missing election): "missing"
# NA results are filled from the row above; afterwards the "Wahlkreis " prefix
# and any parenthesised part are stripped.
df_clean <- df_clean %>%
  mutate(
    wahlkreis = case_when(
      election == "Bezirksvertretungswahlen" ~
        str_extract(text_split, "\\d{1,2}\\. Bezirk"),
      election == "Gemeinderatswahl" ~
        str_extract(
          text_split,
          regex(
            "Wahlkreis.*?(?=[:upper:]{2,}?)",
            dotall = TRUE,
            multiline = TRUE
          )
        ),
      election == "Stadtwahl" ~ "Stadtwahl",
      TRUE ~ "missing"
    ),
    wahlkreis = str_trim(wahlkreis, side = "both")
  ) %>%
  tidyr::fill(wahlkreis, .direction = "down") %>%
  mutate(
    wahlkreis = wahlkreis %>%
      str_remove("Wahlkreis ") %>%
      str_remove(regex("\\(.*\\)")) %>%
      str_trim(side = "both")
  )
# page, name, first name, birth year, postal code ------------------------------
# Per-entry details pulled out of the text with regular expressions:
#   page          - number following "Seite" in the raw text
#   name          - text between the leading "<n>." and ", <4 digits>,"
#   first_name    - last run of letters directly before ", <digits>"
#   year_birth    - first four-digit number in the entry
#   year_interval - year_birth binned into 5-year intervals (1930 to 2005)
#   plz           - four-digit number directly preceding "Wien"
df_clean <- df_clean %>%
  mutate(
    page = as.numeric(
      str_extract(str_extract(text_raw, regex("Seite \\d+")), "\\d+")
    ),
    name = str_trim(
      str_extract(text_split, regex("(?<=\\d\\.\\s?).*?(?=,\\s?\\d{4},)")),
      side = "both"
    ),
    first_name = str_extract(text_split, regex("[:alpha:]*(?=,\\s?\\d+)")),
    year_birth = as.numeric(str_extract(text_split, regex("\\d{4}"))),
    year_interval = cut(year_birth, seq(1930, 2005, 5)),
    plz = str_trim(
      str_extract(text_split, regex("\\d{4}\\s(?=Wien)")),
      side = "both"
    )
  )
# get party ---------------------------------------------------------------
# Party: the last word of the row that directly precedes a list's first
# candidate (listenplatz == 1), carried down to all following rows.
# A preliminary extraction of `party` that used to run first was removed: its
# result was overwritten unconditionally by the case_when() below and never
# read.
df_clean <- df_clean %>%
  mutate(
    party = case_when(
      # the next row is position 1, so this row closes the list header
      lead(listenplatz) == 1 ~ str_extract(text_split, regex("\\w+$")),
      TRUE ~ NA_character_
    )
  ) %>%
  tidyr::fill(party, .direction = "down") %>%
  mutate(
    # sort all levels first, then move the listed parties to the front
    party = party %>%
      as_factor() %>%
      fct_relevel(sort) %>%
      fct_relevel("SPÖ", "FPÖ", "GRÜNE", "ÖVP", "NEOS")
  )
# wrap up -----------------------------------------------------------------
# Postal code(s) belonging to each electoral district, and whether the
# candidate's own postal code (plz) is among them.
#
# Numbered districts are converted arithmetically: the district number plus
# 100, with "0" appended ("3. Bezirk" -> 103 -> "1030"). Named districts are
# then mapped explicitly and take precedence over the numeric result.
#
# str_c() is used instead of paste0() because paste0() turns a missing
# district number into the literal string "NA0", which made such rows count as
# "outside" further down. With str_c() the NA is kept and those rows are
# labelled "missing".
df_clean <- df_clean %>%
  mutate(
    wahlkreis_plz = str_c(
      as.character(as.numeric(str_extract(wahlkreis, regex("\\d+"))) + 100),
      "0"
    ),
    wahlkreis_plz = case_when(
      str_detect(wahlkreis, "Zentrum") ~ "1010, 1040, 1050, 1060",
      str_detect(wahlkreis, "Innen") ~ "1070, 1080, 1090",
      str_detect(wahlkreis, "Leopoldstadt") ~ "1020",
      str_detect(wahlkreis, "Landstraße") ~ "1030",
      str_detect(wahlkreis, "Favoriten") ~ "1100",
      str_detect(wahlkreis, "Simmering") ~ "1110",
      str_detect(wahlkreis, "Meidling") ~ "1120",
      str_detect(wahlkreis, "Hietzing") ~ "1130",
      str_detect(wahlkreis, "Penzing") ~ "1140",
      str_detect(wahlkreis, "Rudolf") ~ "1150",
      str_detect(wahlkreis, "Ottakring") ~ "1160",
      str_detect(wahlkreis, "Hernals") ~ "1170",
      str_detect(wahlkreis, "Währing") ~ "1180",
      str_detect(wahlkreis, "Döbling") ~ "1190",
      str_detect(wahlkreis, "Brigittenau") ~ "1200",
      str_detect(wahlkreis, "Floridsdorf") ~ "1210",
      str_detect(wahlkreis, "Donaustadt") ~ "1220",
      str_detect(wahlkreis, "Liesing") ~ "1230",
      TRUE ~ wahlkreis_plz
    )
  ) %>%
  mutate(
    # plz is used as the pattern: "inside" when it occurs in the district's
    # postal code list; NA on either side falls through to "missing"
    residence = case_when(
      str_detect(wahlkreis_plz, plz) ~ "inside",
      !str_detect(wahlkreis_plz, plz) ~ "outside",
      TRUE ~ "missing"
    )
  )
# Keep only rows that carry a list position and drop the raw page text.
df_clean <- df_clean %>%
  filter(!is.na(listenplatz)) %>%
  select(-text_raw)