Skip to contents

This vignette demonstrates the core research question motivating the consumptionsurveyindia package: does household food consumption predict healthcare spending? Specifically, does higher expenditure on edible oils, cereals, or processed foods at the state level correlate with higher out-of-pocket spending on cardiovascular disease?

Prerequisites

This analysis requires data from two packages — this one and a companion package:

  1. consumptionsurveyindia — food expenditure data (this package)
  2. indiahealthsurvey — NSS health survey data (Schedule 25.0)

Step 1: Get consumption data by state

# Connect to the consumption data stored under this path.
con <- ics_connect("~/data/consumption_parquet")

# Food-group expenditure by state for the HCES 2022-23 round, the round this
# vignette cites as its consumption source. Shares are derived from these
# totals in the next step.
# NOTE(review): this comment used to say a "pre-2022 round" was chosen to match
# the health survey's timing, which contradicts the round requested below —
# confirm which round is intended.
# min_obs presumably suppresses cells with fewer than 50 observations — TODO
# confirm against the ics_consumption_by_state() documentation.
consumption_by_state <- ics_consumption_by_state(
  con,
  round = "HCES-2022-23",
  by_food_group = TRUE,
  min_obs = 50
)

# Pivot to wide: one row per state, columns = food group shares
# Reshape to one row per state with a `share_<food group>` column per group.
# A group's share is its total_value divided by the sum of total_value over
# all food groups reported for that state.
consumption_wide <- consumption_by_state |>
  group_by(state_name) |>
  mutate(share = total_value / sum(total_value)) |>
  ungroup() |>
  select(state_name, food_group, share) |>
  tidyr::pivot_wider(
    names_from = food_group,
    values_from = share,
    names_prefix = "share_"
  )

head(consumption_wide)
#> # A tibble: 6 × 14
#>   state_name  `share_Fruits (Dry)` share_Egg, Fish & Me…¹ share_Milk & Milk Pr…²
#>   <chr>                      <dbl>                  <dbl>                  <dbl>
#> 1 Andaman & …               0.0149                 0.135                  0.0770
#> 2 Andhra Pra…               0.0276                 0.105                  0.111 
#> 3 Arunachal …               0.0209                 0.170                  0.0715
#> 4 Assam                     0.0169                 0.155                  0.0670
#> 5 Bihar                     0.0173                 0.0768                 0.113 
#> 6 Chandigarh                0.0217                 0.0446                 0.258 
#> # ℹ abbreviated names: ¹​`share_Egg, Fish & Meat`, ²​`share_Milk & Milk Products`
#> # ℹ 10 more variables: `share_Edible Oil` <dbl>, `share_Fruits (Fresh)` <dbl>,
#> #   share_Vegetables <dbl>, share_Pulses <dbl>,
#> #   `share_Packaged Processed Food` <dbl>, share_Cereals <dbl>,
#> #   share_Spices <dbl>, `share_Served Processed Food` <dbl>,
#> #   share_Beverages <dbl>, `share_Sugar & Salt` <dbl>

Step 2: Get health expenditure data

library(indiahealthsurvey)

hcon <- ihs_connect("~/data/nss_health")

# Mean hospitalization expenditure per state, broken out by disease category
health_by_state <- ihs_state_summary(
  hcon,
  "hospitalization",
  by_disease = TRUE
)

# Wide layout: one row per state and one `health_<disease>` column per disease
# category, holding that state's mean expenditure. Columns other than the id,
# name and value columns are dropped by naming the id column explicitly.
health_wide <- tidyr::pivot_wider(
  health_by_state,
  id_cols = state_name,
  names_from = disease_category,
  values_from = mean_exp,
  names_prefix = "health_"
)

ihs_disconnect(hcon)
cat(sprintf("%d states with health expenditure data\n", nrow(health_wide)))
#> 37 states with health expenditure data
# Step 3: link the two tables. The inner join on state_name keeps only the
# states that appear in both the consumption and the health data.
linked <- consumption_wide |>
  inner_join(health_wide, by = "state_name")
cat(sprintf("Linked %d states with both consumption and health data\n", nrow(linked)))
#> Linked 34 states with both consumption and health data

Step 4: The Core Question — Oil vs CVD

Does higher edible oil expenditure share predict higher cardiovascular disease hospitalization costs?

# Scatter of edible-oil expenditure share against mean CVD hospitalization
# expenditure, one labelled point per state, with a fitted linear trend and its
# confidence band. The plot is skipped when either column is missing.
if (all(c("share_Edible Oil", "health_Cardiovascular") %in% names(linked))) {
  ggplot(
    data = linked,
    mapping = aes(x = `share_Edible Oil`, y = health_Cardiovascular)
  ) +
    geom_point(colour = "#D55E00", size = 3) +
    geom_smooth(method = "lm", se = TRUE, colour = "#0072B2", alpha = 0.2) +
    geom_text(
      aes(label = state_name),
      size = 2.5, hjust = -0.1, vjust = -0.5
    ) +
    labs(
      title = "Edible Oil Expenditure vs. Cardiovascular Disease Costs",
      subtitle = "State-level ecological analysis",
      caption = "Sources: HCES 2022-23 (consumption), NSS Health Survey (expenditure)",
      x = "Edible oil share of food expenditure",
      y = "Mean CVD hospitalization expenditure (Rs.)"
    ) +
    # Shares are stored as proportions; show them as percentages on the axis.
    scale_x_continuous(labels = scales::percent_format()) +
    theme_minimal(base_size = 12)
}

Step 5: Multiple Diet-Disease Correlations

Which food groups have the strongest association with which disease categories?

# Compute correlation between food group shares and disease expenditures
# Columns are identified by the prefixes added when the tables were pivoted.
food_cols <- names(linked)[startsWith(names(linked), "share_")]
health_cols <- names(linked)[startsWith(names(linked), "health_")]

# Correlation of every food-group share with every disease expenditure column
# (cor()'s default method), using for each pair the states where both values
# are present.
cor_matrix <- cor(
  x = linked[, food_cols],
  y = linked[, health_cols],
  use = "pairwise.complete.obs"
)

# Long format for plotting: one row per (food group, disease) pair, with the
# column prefixes stripped so the axis labels read cleanly.
cor_df <- setNames(
  as.data.frame(as.table(cor_matrix)),
  c("food_group", "disease", "correlation")
)
cor_df[["food_group"]] <- gsub("share_", "", cor_df[["food_group"]])
cor_df[["disease"]] <- gsub("health_", "", cor_df[["disease"]])

# Heatmap with a diverging palette centred on zero and the coefficient printed
# in each tile.
ggplot(
  data = cor_df,
  mapping = aes(x = disease, y = food_group, fill = correlation)
) +
  geom_tile(colour = "white", linewidth = 0.5) +
  geom_text(aes(label = sprintf("%.2f", correlation)), size = 3) +
  scale_fill_gradient2(
    low = "#2166AC",
    mid = "white",
    high = "#B2182B",
    midpoint = 0,
    limits = c(-1, 1)
  ) +
  labs(
    title = "Correlation: Food Group Expenditure Shares vs. Disease Costs",
    subtitle = "State-level Pearson correlation",
    fill = "r",
    x = "Disease category",
    y = "Food group"
  ) +
  theme_minimal(base_size = 11) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Step 6: Cereals + Oil + Processed Foods → NCD Burden

The literature suggests three dietary risk factors for non-communicable diseases: high cereal/refined carbohydrate intake, high edible oil consumption, and rising processed food spending.

# Composite dietary NCD risk score per state:
#   0.4 x cereals share + 0.4 x edible-oil share + 0.2 x packaged processed
#   food share.
# The score is only built when both the cereals and edible-oil shares exist.
if (all(c("share_Cereals", "share_Edible Oil") %in% names(linked))) {
  # The processed-food term is optional. The column name must match the
  # pivoted food-group label exactly ("share_Packaged Processed Food");
  # otherwise the term silently contributes 0. A scalar if/else is used here
  # because ifelse() with a length-1 test returns a length-1 value, which
  # would recycle the first state's share across every row.
  linked$ncd_risk_score <- linked[["share_Cereals"]] * 0.4 +
    linked[["share_Edible Oil"]] * 0.4 +
    (
      if ("share_Packaged Processed Food" %in% names(linked)) {
        linked[["share_Packaged Processed Food"]]
      } else {
        0
      }
    ) * 0.2

  # NCD cost = row-wise mean over the columns whose names mention
  # cardiovascular disease or diabetes, ignoring missing values.
  ncd_cols <- grep("Cardio|Diabetes", names(linked), value = TRUE)
  if (length(ncd_cols) > 0) {
    linked$ncd_cost <- rowMeans(linked[, ncd_cols], na.rm = TRUE)

    ggplot(linked, aes(x = ncd_risk_score, y = ncd_cost)) +
      geom_point(size = 3, colour = "#D55E00") +
      geom_smooth(method = "lm", se = TRUE, colour = "#0072B2", alpha = 0.2) +
      geom_text(aes(label = state_name), size = 2.5, hjust = -0.1, vjust = -0.5) +
      labs(
        title = "Dietary NCD Risk Score vs. NCD Healthcare Costs",
        subtitle = "Risk = 0.4×Cereals + 0.4×Oil + 0.2×Processed foods",
        x = "Dietary NCD risk score",
        y = "Mean NCD hospitalization cost (Rs.)"
      ) +
      theme_minimal(base_size = 12)
  }
}

Caveats

This is an ecological analysis — it correlates state-level averages, not individual household data. Correlations do not imply causation. Confounders include income, urbanisation, healthcare access, and genetic predisposition.

The HCES and NSS health surveys sample different households. Linkage is at the state level only (not household or district level).

The HCES 2022-23 uses a different methodology than earlier rounds (three-visit design), so cross-era comparisons require caution.

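Cleanup

Once the analysis is finished, close the consumption data connection (`con`) opened in Step 1; the health survey connection was already closed in Step 2.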

Data Sources

  • Consumption: HCES 2022-23 (MOSPI, Government of India)
  • Health: NSS Health Survey Schedule 25.0 (MOSPI)
  • Downloaded from microdata.gov.in