R survey data add weights

Question

I have survey data with weights already assigned. Here is a toy sample. If I don't weight the data, it will not be an accurate representation of the population. Is there a simple way to factor in the weights? Given that the weight is how many people in population X are represented by a given person in my sample, do I multiply obs * weight? I checked on youtube but I don't have strata data referred to in the video, my data are all individuals and did not have any luck searching elsewhere.

Thank you in advance for any help

ID <-  c(1,2,3,4,5,6,7,8,9,10,11,12,
     13,14,15,16,17,18,19,20,21,22,23,24,
     25,26,27,28,29,30,31,32,33,34,35,36)
year <- c(1980,1980,1980,1981,1982,1982,1980,1980,1981,1981,1982,1982,
      1980,1980,1980,1981,1982,1982,1980,1980,1981,1981,1982,1982,
      1980,1980,1980,1981,1982,1982,1980,1980,1981,1981,1982,1982)
city <- c("NY","NY","NY","NY","NY","NY","NY","NY","NY","NY","NY","NY",
      "NY","NY","NY","NY","NY","NY","NY","NY","NY","NY","NY","NY",
      "CA","CA","CA","CA","CA","CA","CA","CA","CA","CA","CA","CA")
district <- c(1,2,1,2,1,2,1,2,1,2,1,2,
          1,2,1,2,1,2,1,2,1,2,1,2,
          1,2,1,2,1,2,1,2,1,2,1,2)
weight <- c(100,17,25,1,100,52,10,5,90,10,10,15,
        13,1,25,1,6,52,10,5,90,7,10,15,
        1,2,3,4,5,6,10,20,3,40,50,6)

df <- data.frame(ID,year,city,district,weight)

df$year <- as.factor(df$year)
df$district <- as.factor(df$district)

print(df)


# aggregate count by year, city and district
a <- aggregate(ID ~ year + city + district, 
            data = df,  
            FUN = length) 

colnames(a)[colnames(a) == 'ID'] <- 'numObs'

# add new Total column, populate total by year
a[a$year == 1980, "TotalforYear"] <-  with(a, sum(numObs[year == 1980]))
a[a$year == 1981, "TotalforYear"] <-  with(a, sum(numObs[year == 1981]))  
a[a$year == 1982, "TotalforYear"] <-  with(a, sum(numObs[year == 1982]))

# add new column called share and initialize it
a[ , 'share'] <- 0
a$share = (a$numObs/a$TotalforYear)

# add group column
a[a$district == 1 & a$city == 'NY', "group"] <- 'NY district1'
a[a$district == 2 & a$city == 'NY', "group"] <- 'NY district2'
a[a$district == 1 & a$city == 'CA', "group"] <- 'CA district1'
a[a$district == 2 & a$city == 'CA', "group"] <- 'CA district2'

a

g1 <- ggplot(a, aes(x=year, y=share, group=group)) +
  geom_line(aes(linetype=group), color = 'red', size = 0.8) +
  geom_point(aes(shape=group), size = 2) +
  theme_classic() +  
  theme(legend.position="bottom",
    panel.grid.major.x = element_line( linewidth=.1, color="grey93" ), 
    panel.grid.major.y = element_line( linewidth=.1, color="grey93" ) )       +
  ylab("Share") +
  scale_linetype_manual(values=c("solid", "dotted", 'dashed', 'longdash')) +
  theme(legend.title=element_blank()) 
g1

Thomas Lumley · Accepted Answer

A more general answer is to use the survey package or its srvyr tidyverse wrapper. This lets you do all sorts of weighted analyses and also get valid uncertainty estimates.

pachadotdev · Answer

Edited to reflect the changes to the question.

In Stata, you would use "pweight" or "fweight" depending on your analysis needs, and the principle in R is similar: to adjust the data by the weights to reflect the sample's representation in the population.

In your case you have frequency weights, and you can multiply each observation by its weight to reflect the population count in your aggregation, and then aggregate the weighted counts by the desired variables.

From the possible partitions (or groups), you can take year, city, and favFood, summing up the weights. The result gives you a representation of the population’s preferences rather than just the sample and one way to do it is:

library(ggplot2)

ID <- c(
  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
  13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
  25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36
)
year <- c(
  1980, 1980, 1980, 1981, 1982, 1982, 1980, 1980, 1981, 1981, 1982, 1982,
  1980, 1980, 1980, 1981, 1982, 1982, 1980, 1980, 1981, 1981, 1982, 1982,
  1980, 1980, 1980, 1981, 1982, 1982, 1980, 1980, 1981, 1981, 1982, 1982
)
city <- c(
  "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY",
  "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY",
  "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA"
)
district <- c(
  1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
  1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
  1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
)
weight <- c(
  100, 17, 25, 1, 100, 52, 10, 5, 90, 10, 10, 15,
  13, 1, 25, 1, 6, 52, 10, 5, 90, 7, 10, 15,
  1, 2, 3, 4, 5, 6, 10, 20, 3, 40, 50, 6
)

df <- data.frame(ID, year, city, district, weight)

df$year <- as.factor(df$year)
df$district <- as.factor(df$district)

a <- aggregate(weight ~ year + city + district, 
               data = df, 
               FUN = sum)

colnames(a)[colnames(a) == 'weight'] <- 'weightedCount'

a[a$year == 1980, "TotalWeighted"] <- with(a, sum(weightedCount[year == 1980]))
a[a$year == 1981, "TotalWeighted"] <- with(a, sum(weightedCount[year == 1981]))
a[a$year == 1982, "TotalWeighted"] <- with(a, sum(weightedCount[year == 1982]))

a$share <- a$weightedCount / a$TotalWeighted

a[a$district == 1 & a$city == 'NY', "group"] <- 'NY district1'
a[a$district == 2 & a$city == 'NY', "group"] <- 'NY district2'
a[a$district == 1 & a$city == 'CA', "group"] <- 'CA district1'
a[a$district == 2 & a$city == 'CA', "group"] <- 'CA district2'

# g2: g1 with modifications to include weights

g2 <- ggplot(a, aes(x=year, y=share, group=group)) +
  geom_line(aes(linetype=group), color = 'blue', linewidth = 0.8) +
  geom_point(aes(shape=group), size = 2) +
  theme_classic() +  
  theme(legend.position="bottom",
        panel.grid.major.x = element_line(linewidth=.1, color="grey93"), 
        panel.grid.major.y = element_line(linewidth=.1, color="grey93")) +
  ylab("Weighted Share") +
  scale_linetype_manual(values=c("solid", "dotted", 'dashed', 'longdash')) +
  theme(legend.title=element_blank()) 

g2

I prefer to use the Tidyverse, and that would be

library(ggplot2)
library(dplyr)

df <- tibble(
  ID = c(
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
    13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
    25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36
  ),
  year = c(
    1980, 1980, 1980, 1981, 1982, 1982, 1980, 1980, 1981, 1981, 1982, 1982,
    1980, 1980, 1980, 1981, 1982, 1982, 1980, 1980, 1981, 1981, 1982, 1982,
    1980, 1980, 1980, 1981, 1982, 1982, 1980, 1980, 1981, 1981, 1982, 1982
  ),
  city = c(
    "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY",
    "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY",
    "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA"
  ),
  district = c(
    1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
    1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
    1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
  ),
  weight = c(
    100, 17, 25, 1, 100, 52, 10, 5, 90, 10, 10, 15,
    13, 1, 25, 1, 6, 52, 10, 5, 90, 7, 10, 15,
    1, 2, 3, 4, 5, 6, 10, 20, 3, 40, 50, 6
  )
)

df <- df %>%
  mutate(
    year = as.factor(year),
    district = as.factor(district)
  )

a <- df %>%
  group_by(year, city, district) %>%
  summarise(weightedCount = sum(weight), .groups = "drop") %>%
  mutate(
    TotalWeighted = sum(weightedCount, na.rm = TRUE),
    share = weightedCount / TotalWeighted,
    group = case_when(
      district == 1 & city == "NY" ~ "NY district1",
      district == 2 & city == "NY" ~ "NY district2",
      district == 1 & city == "CA" ~ "CA district1",
      district == 2 & city == "CA" ~ "CA district2"
    )
  )

a

# g2: g1 with modifications to include weights

g2 <- ggplot(a, aes(x=year, y=share, group=group)) +
  geom_line(aes(linetype=group), color = 'blue', linewidth = 0.8) +
  geom_point(aes(shape=group), size = 2) +
  theme_classic() +  
  theme(legend.position="bottom",
        panel.grid.major.x = element_line(linewidth=.1, color="grey93"), 
        panel.grid.major.y = element_line(linewidth=.1, color="grey93")) +
  ylab("Weighted Share") +
  scale_linetype_manual(values=c("solid", "dotted", 'dashed', 'longdash')) +
  theme(legend.title=element_blank()) 

g2

enter image description here

R survey data add weights

Tags:

r

sample-data

weighted

survey

ithoughtso

2 Answers

Thomas Lumley

pachadotdev

Recent Activity

Donate For Us

R survey data add weights

Tags:

r

sample-data

weighted

survey

ithoughtso

2 Answers

Thomas Lumley

pachadotdev

Related questions

Recent Activity

Donate For Us