The educabR package provides easy access to Brazilian public education data from INEP, FNDE, and CAPES. With simple functions, you can download and process data from 14 datasets:
Basic Education:
- IDEB - Basic Education Development Index
- ENEM - National High School Exam
- School Census (Censo Escolar)
- SAEB - Basic Education Assessment System
- ENCCEJA - Youth and Adult Education Certification
Exam
- ENEM by School - ENEM results aggregated by school
(2005-2015)
Higher Education:
- Higher Education Census (Censo da Educação
Superior)
- ENADE - National Student Performance Exam
- IDD - Value-Added Indicator
- CPC - Preliminary Course Concept
- IGC - General Courses Index
Graduate Education:
- CAPES - Graduate programs, students, faculty,
courses, and theses
Education Funding:
- FUNDEB - Resource distribution and enrollment
data
This vignette covers IDEB, ENEM, and the School Census. For other
datasets, see:
- vignette("basic-education-assessments") — SAEB, ENCCEJA,
ENEM by School
- vignette("higher-education") — Higher Education Census,
ENADE, IDD, CPC, IGC, CAPES
- vignette("education-funding") — FUNDEB
educabR caches downloaded files to avoid repeated downloads. By default, it uses a temporary directory. To persist data across sessions:
IDEB is the main indicator of basic education quality in Brazil, combining test performance (SAEB) with grade promotion rates.
# School-level IDEB, early elementary (1st-5th grade), 2021
ideb_schools <- get_ideb(
  year = 2021, level = "escola", stage = "anos_iniciais"
)

# Municipality-level IDEB, high school, 2023
ideb_municipalities <- get_ideb(
  year = 2023, level = "municipio", stage = "ensino_medio"
)

# Same school-level request, restricted to one state through `uf` (faster)
ideb_sp <- get_ideb(
  year = 2021, level = "escola", stage = "anos_iniciais", uf = "SP"
)

# View structure
glimpse(ideb_schools)
# The listing below is abridged: 17 columns are reported, 12 are shown.
#> Rows: 63,529
#> Columns: 17
#> $ sg_uf <chr> "RO", "RO", "RO", ...
#> $ co_municipio <dbl> 1100015, 1100015, ...
#> $ no_municipio <chr> "Alta Floresta D'Oeste", ...
#> $ id_escola <dbl> 11000023, 11000040, ...
#> $ no_escola <chr> "EEEE ABNAEL MACHADO DE LIMA", ...
#> $ rede <chr> "Estadual", "Municipal", ...
#> $ vl_aprovacao_2021_si_4 <dbl> 93.3, 98.5, 100, ...
#> $ vl_indicador_rend_2021 <dbl> 0.92, 0.98, 1.00, ...
#> $ vl_nota_matematica_2021 <dbl> 5.2, 5.8, 6.1, ...
#> $ vl_nota_portugues_2021 <dbl> 5.4, 5.9, 6.0, ...
#> $ vl_nota_media_2021 <dbl> 5.3, 5.85, 6.05, ...
#> $ vl_observado_2021 <dbl> 4.9, 5.7, 6.1, ...

# Calculate average IDEB by state
# One row per state: number of schools with an observed 2021 IDEB, plus the
# mean and median of that score, sorted from highest to lowest mean.
ideb_by_state <- ideb_schools |>
  filter(!is.na(vl_observado_2021)) |>
  group_by(sg_uf) |>
  summarise(
    n_schools = n(),
    mean_ideb = mean(vl_observado_2021, na.rm = TRUE),
    median_ideb = median(vl_observado_2021, na.rm = TRUE)
  ) |>
  arrange(desc(mean_ideb))

# Plot: horizontal bars, states ordered by their mean IDEB
ideb_by_state |>
  ggplot(aes(x = reorder(sg_uf, mean_ideb), y = mean_ideb)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Average IDEB by State - Early Elementary (2021)",
    x = "State",
    y = "Average IDEB"
  ) +
  theme_minimal()

# Download historical series
# Municipality-level IDEB for early elementary across four editions
ideb_history <- get_ideb_series(
  years = c(2017, 2019, 2021, 2023),
  level = "municipio",
  stage = "anos_iniciais"
)

# National trend: unweighted mean of the observed municipal IDEB per edition
trend <-
  ideb_history |>
  group_by(ano_ideb) |>
  summarise(mean_ideb = mean(vl_observado, na.rm = TRUE))

# Line thickness is set with `linewidth`: using `size` for lines has been
# deprecated since ggplot2 3.4.0 and emits a warning. Points still use `size`.
ggplot(trend, aes(x = ano_ideb, y = mean_ideb)) +
  geom_line(color = "darkgreen", linewidth = 1.2) +
  geom_point(color = "darkgreen", size = 3) +
  labs(
    title = "IDEB Trend - Early Elementary",
    x = "Year",
    y = "National Average IDEB"
  ) +
  theme_minimal()

ENEM is Brazil’s largest exam, with millions of participants annually. The microdata includes demographics, socioeconomic data, and scores.
# NOTE(review): `enem_sample` is not created anywhere in the visible text —
# presumably the chunk that downloads it was lost; confirm against the
# original vignette.
glimpse(enem_sample)
# The listing below is abridged: 76 columns are reported, 10 are shown.
#> Rows: 10,000
#> Columns: 76
#> $ nu_inscricao <dbl> 230001234567, ...
#> $ nu_ano <dbl> 2023, 2023, ...
#> $ tp_faixa_etaria <dbl> 3, 4, 2, ...
#> $ tp_sexo <chr> "F", "M", "F", ...
#> $ tp_cor_raca <dbl> 1, 3, 2, ...
#> $ nu_nota_cn <dbl> 512.3, 489.1, ...
#> $ nu_nota_ch <dbl> 598.2, 567.4, ...
#> $ nu_nota_lc <dbl> 534.8, 502.1, ...
#> $ nu_nota_mt <dbl> 478.9, 521.3, ...
#> $ nu_nota_redacao <dbl> 720, 640, ...

# Summary statistics for scores
enem_summary(enem_sample)

# Summary by sex
enem_summary(enem_sample, by = "tp_sexo")

# Average scores by race/ethnicity.
# Codes 0-5 of `tp_cor_raca` are translated through a positional lookup;
# any other value (including NA) has no match and becomes NA.
scores_by_race <- enem_sample |>
  filter(!is.na(nu_nota_mt)) |>
  mutate(
    race = c(
      "Not declared", "White", "Black",
      "Mixed race", "Asian", "Indigenous"
    )[match(tp_cor_raca, 0:5)]
  ) |>
  group_by(race) |>
  summarise(
    n = n(),
    mean_math = mean(nu_nota_mt, na.rm = TRUE),
    mean_essay = mean(nu_nota_redacao, na.rm = TRUE)
  )

The School Census is Brazil’s main statistical survey on basic education, covering all public and private schools.
# The census contains over 400 variables per school!
# NOTE(review): `schools_2023` is not created anywhere in the visible text —
# presumably the chunk that downloads it was lost; confirm against the
# original vignette.
glimpse(schools_2023)
# The listing below is abridged: 408 columns are reported, 9 are shown.
#> Rows: 217,625
#> Columns: 408
#> $ nu_ano_censo <dbl> 2023, 2023, ...
#> $ sg_uf <chr> "RO", "RO", ...
#> $ co_uf <dbl> 11, 11, ...
#> $ no_municipio <chr> "Porto Velho", ...
#> $ co_municipio <dbl> 1100205, ...
#> $ no_entidade <chr> "EEEE ABNAEL MACHADO DE LIMA", ...
#> $ co_entidade <dbl> 11000023, ...
#> $ tp_dependencia <dbl> 2, 3, 4, ...
#> $ tp_localizacao <dbl> 1, 1, 1, ...

# Count by administrative type
# Tally schools per administrative type and compute each type's share (%).
# Codes 1-4 of `tp_dependencia` are translated through a positional lookup;
# any other value (including NA) has no match and becomes NA.
schools_by_type <- schools_2023 |>
  mutate(
    admin_type = c(
      "Federal", "State", "Municipal", "Private"
    )[match(tp_dependencia, 1:4)]
  ) |>
  count(admin_type) |>
  mutate(pct = n / sum(n) * 100)

# Horizontal bars ordered by count, each labelled with its percentage
schools_by_type |>
  ggplot(aes(x = reorder(admin_type, n), y = n, fill = admin_type)) +
  geom_col() +
  geom_text(aes(label = sprintf("%.1f%%", pct)), hjust = -0.1) +
  coord_flip() +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Number of Schools by Administrative Type (2023)",
    x = NULL,
    y = "Number of Schools"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Check infrastructure availability in public schools
# Percentage of non-missing values in `flag` that are equal to 1
pct_equal_one <- function(flag) {
  mean(flag == 1, na.rm = TRUE) * 100
}

# Keep schools whose `tp_dependencia` is 2 or 3 (public schools only) and
# report, for each infrastructure indicator, the share of schools flagged 1.
infra <- schools_2023 |>
  filter(tp_dependencia %in% c(2, 3)) |>
  summarise(
    pct_internet = pct_equal_one(in_internet),
    pct_library = pct_equal_one(in_biblioteca),
    pct_computer_lab = pct_equal_one(in_laboratorio_informatica),
    pct_sports_court = pct_equal_one(in_quadra_esportes),
    pct_accessibility = pct_equal_one(in_acessibilidade)
  )

print(infra)
#> # A tibble: 1 x 5
#> pct_internet pct_library pct_computer_lab pct_sports_court pct_accessibility
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 78.3 42.1 35.2 48.7 32.1

For detailed information about variables, see the official documentation:
https://www.fnde.gov.br

n_max for testing.

Found a bug or want to suggest improvements? Open an issue on GitHub: https://github.com/SidneyBissoli/educabR/issues