I have a dataset with 5 million records in this format:
library(dplyr)
Data <- tibble(X = runif(5000000, min = 200000, max = 400000),
               Y = runif(5000000, min = 400000, max = 500000),
               UID = 1:5000000,
               Count = runif(5000000, min = 1, max = 2000))
For each UID, and for each of four search distances (200m, 400m, 800m and 1600m), I need to summarise: 1) the number of other points within that distance, 2) the sum of those points' Count values, and 3) a density calculation. The required output is a table with one row per UID and its summary values.
This for loop illustrates the output required, but would take weeks to complete:
Summary <- NULL
for (n in 1:nrow(Data)) {
  Data_Row <- Data[n, ]
  Row_200 <- Data %>%
    filter(between(X, Data_Row$X - 200, Data_Row$X + 200) & between(Y, Data_Row$Y - 200, Data_Row$Y + 200)) %>%
    summarise(Instances_200 = n(), Count_200 = sum(Count), Calculation_200 = Count_200 / (Instances_200 * 0.000625))
  Row_400 <- Data %>%
    filter(between(X, Data_Row$X - 400, Data_Row$X + 400) & between(Y, Data_Row$Y - 400, Data_Row$Y + 400)) %>%
    summarise(Instances_400 = n(), Count_400 = sum(Count), Calculation_400 = Count_400 / (Instances_400 * 0.000625))
  Row_800 <- Data %>%
    filter(between(X, Data_Row$X - 800, Data_Row$X + 800) & between(Y, Data_Row$Y - 800, Data_Row$Y + 800)) %>%
    summarise(Instances_800 = n(), Count_800 = sum(Count), Calculation_800 = Count_800 / (Instances_800 * 0.000625))
  Row_1600 <- Data %>%
    filter(between(X, Data_Row$X - 1600, Data_Row$X + 1600) & between(Y, Data_Row$Y - 1600, Data_Row$Y + 1600)) %>%
    summarise(Instances_1600 = n(), Count_1600 = sum(Count), Calculation_1600 = Count_1600 / (Instances_1600 * 0.000625))
  Summary_Row <- bind_cols(Data[n, "UID"], Row_200, Row_400, Row_800, Row_1600)
  Summary <- bind_rows(Summary, Summary_Row)
}
I am aware of things like lapply and map, but I don't know how they could be used to do this dynamic filtering and summarisation more quickly.
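For illustration, a minimal sketch (using the same Data and dplyr code as above) of what wrapping the per-row body in purrr::map_dfr could look like; only the 200m block is shown. It avoids repeatedly growing Summary with bind_rows, but each of the 5 million iterations still filters the full table, so it would remain far too slow:
library(purrr)
# Sketch only: same per-row logic as the loop above, driven by map_dfr
Summary <- map_dfr(seq_len(nrow(Data)), function(n) {
  Data_Row <- Data[n, ]
  Row_200 <- Data %>%
    filter(between(X, Data_Row$X - 200, Data_Row$X + 200) &
             between(Y, Data_Row$Y - 200, Data_Row$Y + 200)) %>%
    summarise(Instances_200 = n(), Count_200 = sum(Count),
              Calculation_200 = Count_200 / (Instances_200 * 0.000625))
  bind_cols(Data[n, "UID"], Row_200)  # Row_400/800/1600 would be built the same way
})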
Based on @Jon Spring's comment, I have used this answer as inspiration to speed up the process. The approach assigns each point to a 50m grid cell, counts the points per cell, and then processes the cells one 5km block at a time so that only nearby cells are ever cross-joined. The code below now takes around 12 hours to run on my real data (as opposed to the days/weeks the code in my question would have taken). It is a bit memory hungry at points (I am using a Windows desktop with 32GB of RAM), which is why I have minimised what gets stored (i.e. why I write each loop's output to CSV and read them all back in at the end). Using data.table has definitely sped things up, but I am sure there is still a quicker/more efficient way to do this - it does what I need, even if my code is messy!
# geosphere v1.5-19 required
install.packages('geosphere', repos='https://rspatial.r-universe.dev')
# Required libraries
library(data.table)
library(dplyr)
library(geosphere)
library(tidyr)
library(readr)
library(purrr)
set.seed(123)
# Input data
Data <- data.table(
  X = runif(5000000, min = 200000, max = 400000),
  Y = runif(5000000, min = 400000, max = 500000),
  UID = 1:5000000,
  Count = sample(1:2000, 5000000, replace = TRUE))
# Add 50m grid cell ID for each point
Data[, ID := OSGB(Data[,c("X", "Y")], "50m")]
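# (As used here, OSGB() converts each easting/northing pair into an Ordnance Survey
#  grid reference string at the requested resolution, so every point falling in the
#  same 50m square gets the same ID, e.g. "SC550650NE" in the output below.)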
# Summarise number of records for each 50m grid cell
Data_Summary <- Data[, .(Count = .N), by=.(ID)]
# Generate centroid coordinates for the 50 metre grid squares covering the extent of 'Data'
Coordinates <- CJ(X = seq((floor(min(Data$X)/1000)*1000)+25, ceiling(max(Data$X)/1000)*1000, 50),
                  Y = seq((floor(min(Data$Y)/1000)*1000)+25, ceiling(max(Data$Y)/1000)*1000, 50))
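# (CJ() is data.table's cross join: it returns every combination of the two sequences,
#  here a regular 50m lattice of candidate cell centres, offset 25m into each cell,
#  covering the bounding box of 'Data'.)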
# Add 50m grid cell ID for each centroid
Coordinates[, ID := OSGB(Coordinates[,c("X", "Y")], "50m")]
# Join 50m grid cell centroid coordinates to 'Data_Summary'
Data_Summary_with_Centroids <- Coordinates[Data_Summary, on = .(ID), nomatch = NULL]
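# (This is a data.table join with nomatch = NULL, i.e. an inner join on ID, so only
#  50m cells that actually contain points are kept, each with its centroid X/Y and
#  its point count.)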
# Define maximum buffer size (slightly larger than the largest 1600m search radius)
Buffer <- 1650
# Generate the four corner (NW, NE, SW, SE) coordinates of each centroid's buffer square
Data_Summary_with_Centroids[,`:=`(X_NW = X-Buffer, Y_NW = Y+Buffer,
X_NE = X+Buffer, Y_NE = Y+Buffer,
X_SW = X-Buffer, Y_SW = Y-Buffer,
X_SE = X+Buffer, Y_SE = Y-Buffer)]
# Assign a 5km grid cell ID to each centroid and to each of its buffer corners
Data_Summary_with_Centroids[,`:=`(HashCN = OSGB(Data_Summary_with_Centroids[,c("X", "Y")], "5km"),
HashNW = OSGB(Data_Summary_with_Centroids[,c("X_NW", "Y_NW")], "5km"),
HashNE = OSGB(Data_Summary_with_Centroids[,c("X_NE", "Y_NE")], "5km"),
HashSW = OSGB(Data_Summary_with_Centroids[,c("X_SW", "Y_SW")], "5km"),
HashSE = OSGB(Data_Summary_with_Centroids[,c("X_SE", "Y_SE")], "5km"))]
# Remove offset XY coordinates
Data_Summary_with_Centroids[, c("X_NW", "Y_NW","X_NE", "Y_NE","X_SW", "Y_SW","X_SE", "Y_SE"):=NULL]
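# (At this point every 50m cell records the 5km block containing its centroid (HashCN)
#  and the 5km blocks containing the four corners of its 1650m bounding square. A cell
#  is relevant to a given 5km block if any of these five hashes matches it, which is
#  what the per-block subset at the top of the loop below relies on.)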
# List of all unique 5km grid cell IDs
Unique_5km_Hash <- unique(Data_Summary_with_Centroids$HashCN)
XY_Columns <- c("X", "Y")
for (n in 1:length(Unique_5km_Hash)){
# Subset so only grid cells that have this loop's 5km grid cell ID in any of HashCN, HashNW, HashNE, HashSW or HashSE remain
Hash <- Data_Summary_with_Centroids[Data_Summary_with_Centroids[, Reduce(`|`, lapply(.SD, `==`, Unique_5km_Hash[n])),
.SDcols = c("HashCN", "HashNW","HashNE","HashSW","HashSE")]][, c("HashNW", "HashNE","HashSW", "HashSE"):=NULL]
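# (The Reduce()/lapply() filter above is a compact way of writing
#  Data_Summary_with_Centroids[HashCN == Unique_5km_Hash[n] | HashNW == Unique_5km_Hash[n] |
#                              HashNE == Unique_5km_Hash[n] | HashSW == Unique_5km_Hash[n] |
#                              HashSE == Unique_5km_Hash[n]],
#  i.e. keep every 50m cell whose centroid or buffer corner falls in this 5km block.)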
# Select only the columns needed for 'crossing'
Hash_Cross <- Hash[, ..XY_Columns]
# Expand to all pairwise combinations of cells (cross join of this neighbourhood with itself)
suppressMessages(Hash_5km <- Hash_Cross %>%
crossing(., ., .name_repair = "unique"))
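# (crossing(., ., .name_repair = "unique") pairs every cell in this neighbourhood with
#  every other cell; the repaired names are X...1/Y...2 for the 'origin' cell and
#  X...3/Y...4 for the candidate neighbour, which is why those names appear in the
#  distance calculation below.)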
rm(Hash_Cross)
# Clear memory
invisible(gc())
# Calculate distances, discard those over 1600m, keep only 'origin' cells whose HashCN matches this loop's 5km block, and attach Count
Hash_5km <- Hash_5km %>%
mutate(Distance = sqrt((X...1 - X...3)^2 + (Y...2 - Y...4)^2)) %>%
filter(Distance <=1600) %>%
left_join(.,Hash %>% select(X, Y, ID, HashCN), join_by(X...1 == X, Y...2 == Y)) %>%
filter(HashCN == Unique_5km_Hash[n]) %>%
left_join(.,Hash %>% select(X, Y, Count), join_by(X...3 == X, Y...4 == Y)) %>%
select(ID, Count, Distance)
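# (Distance is the plain Euclidean distance on the projected easting/northing, e.g. a
#  cell 300m east and 400m north of the origin is sqrt(300^2 + 400^2) = 500m away, so
#  no geodesic calculation is needed even though geosphere is loaded.)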
# Clear memory
invisible(gc())
# Create summary statistics of other grid cells within 200m
Hash_200 <- Hash_5km %>%
filter(Distance <=200) %>%
group_by(ID) %>%
summarise(Instances_200 = n(), Count_200 = sum(Count), Calculation_200 = Count_200/(Instances_200*0.25))
# Create summary statistics of other grid cells within 400m
Hash_400 <- Hash_5km %>%
filter(Distance <=400) %>%
group_by(ID) %>%
summarise(Instances_400 = n(), Count_400 = sum(Count), Calculation_400 = Count_400/(Instances_400*0.25))
# Create summary statistics of other grid cells within 800m
Hash_800 <- Hash_5km %>%
filter(Distance <=800) %>%
group_by(ID) %>%
summarise(Instances_800 = n(), Count_800 = sum(Count), Calculation_800 = Count_800/(Instances_800*0.25))
# Create summary statistics of other grid cells within 1600m
Hash_1600 <- Hash_5km %>%
group_by(ID) %>%
summarise(Instances_1600 = n(), Count_1600 = sum(Count), Calculation_1600 = Count_1600/(Instances_1600*0.25))
# Join summary statistics together
Output <- Hash_200 %>%
left_join(Hash_400, by='ID') %>%
left_join(Hash_800, by='ID') %>%
left_join(Hash_1600, by='ID')
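# (Alternative sketch, not run: the four filter/summarise blocks above could be
#  generated in one pass over the radii and then joined, assuming 'Hash_5km' as built
#  above; 'Radii' is a hypothetical helper vector.)
# Radii <- c(200, 400, 800, 1600)
# Output <- map(Radii, function(r) {
#   Hash_5km %>%
#     filter(Distance <= r) %>%
#     group_by(ID) %>%
#     summarise(Instances = n(), Count = sum(Count),
#               Calculation = Count / (Instances * 0.25)) %>%
#     rename_with(~ paste0(.x, "_", r), -ID)
# }) %>%
#   reduce(left_join, by = "ID")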
# Export output
write_csv(Output, paste0("C:/Temp/Hash/",Unique_5km_Hash[n],".csv"))
rm(Hash, Hash_5km, Hash_200, Hash_400, Hash_800, Hash_1600, Output)
invisible(gc())
}
# Read all CSV files back into a single table
Grid_Calculations <- list.files(path = "C:/Temp/Hash/", pattern = "\\.csv$", full.names = TRUE) %>%
map_dfr(read_csv, show_col_types = FALSE)
EDIT: I have added set.seed() and slightly changed the start of the loop so that only the relevant columns are used with 'crossing' (this makes it more memory efficient for me).
Output
First five rows of the output from the first loop iteration:
ID | Instances_200 | Count_200 | Calculation_200 | Instances_400 | Count_400 | Calculation_400 | Instances_800 | Count_800 | Calculation_800 | Instances_1600 | Count_1600 | Calculation_1600 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
SC550650NE | 20 | 24 | 4.8 | 88 | 119 | 5.4090909090909 | 360 | 475 | 5.27777777777777 | 1503 | 2041 | 5.43180306054557 |
SC550650SW | 22 | 28 | 5.09090909090909 | 87 | 119 | 5.47126436781609 | 360 | 475 | 5.27777777777777 | 1500 | 2049 | 5.464 |
SC550651NE | 20 | 30 | 6 | 86 | 117 | 5.44186046511627 | 364 | 479 | 5.26373626373626 | 1494 | 2032 | 5.44042838018741 |
SC550651SE | 21 | 30 | 5.71428571428571 | 85 | 113 | 5.31764705882352 | 364 | 479 | 5.26373626373626 | 1491 | 2037 | 5.46478873239436 |
SC550652NE | 28 | 42 | 6 | 85 | 113 | 5.31764705882352 | 369 | 485 | 5.25745257452574 | 1487 | 2020 | 5.43375924680564 |
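As a check on the Calculation columns: for the first row, Calculation_200 = Count_200 / (Instances_200 * 0.25) = 24 / (20 * 0.25) = 4.8, the 0.25 presumably being the area of one 50m grid cell in hectares.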