
For Loop alternative to dynamically filter and summarise millions of points

Tags: r, filter, dplyr

I have a dataset with 5 million records in this format:

library(dplyr)

Data <- tibble(X = runif(5000000, min = 200000, max = 400000),
               Y = runif(5000000, min = 400000, max = 500000),
               UID = 1:5000000,
               Count = runif(5000000, min = 1, max = 2000))

For each UID, I need to summarise (1) the number of other points, (2) the sum of Count across those points, and (3) a density calculation, for points that are within 200m, 400m, 800m and 1600m. The final output required is a table with each UID and its summary output.

This for loop illustrates the output required, but would take weeks to complete:

Summary <- NULL

for (n in 1:nrow(Data)){
  Data_Row <- Data[n,]

  Row_200 <- Data %>%
    filter(between(X, Data_Row$X-200, Data_Row$X+200) & between(Y, Data_Row$Y-200, Data_Row$Y+200)) %>%
    summarise(Instances_200 = n(), Count_200 = sum(Count), Calculation_200 = Count_200/(Instances_200*0.000625))

  Row_400 <- Data %>%
    filter(between(X, Data_Row$X-400, Data_Row$X+400) & between(Y, Data_Row$Y-400, Data_Row$Y+400)) %>%
    summarise(Instances_400 = n(), Count_400 = sum(Count), Calculation_400 = Count_400/(Instances_400*0.000625))

  Row_800 <- Data %>%
    filter(between(X, Data_Row$X-800, Data_Row$X+800) & between(Y, Data_Row$Y-800, Data_Row$Y+800)) %>%
    summarise(Instances_800 = n(), Count_800 = sum(Count), Calculation_800 = Count_800/(Instances_800*0.000625))

  Row_1600 <- Data %>%
    filter(between(X, Data_Row$X-1600, Data_Row$X+1600) & between(Y, Data_Row$Y-1600, Data_Row$Y+1600)) %>%
    summarise(Instances_1600 = n(), Count_1600 = sum(Count), Calculation_1600 = Count_1600/(Instances_1600*0.000625))

  Summary_Row <- bind_cols(Data[n, "UID"], Row_200, Row_400, Row_800, Row_1600)

  Summary <- bind_rows(Summary, Summary_Row)
}

I am aware of things like lapply and map, but I don't know how they could be used to do this dynamic filtering and summarisation more quickly.
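For reference, the same row-by-row computation could be phrased with purrr, but that alone would not help: every row still filters the full 5-million-row table, so the work stays quadratic. A rough sketch of the functional version (the summarise_radius helper is hypothetical, not an established function):

library(purrr)

# Hypothetical helper: summary statistics for one radius around one point
summarise_radius <- function(row, r) {
  Data %>%
    filter(between(X, row$X - r, row$X + r) & between(Y, row$Y - r, row$Y + r)) %>%
    summarise("Instances_{r}" := n(),
              "Count_{r}" := sum(Count),
              "Calculation_{r}" := sum(Count) / (n() * 0.000625))
}

# Same output as the loop above, and roughly the same runtime
Summary <- map_dfr(1:nrow(Data), function(n) {
  row <- Data[n, ]
  bind_cols(row["UID"], map(c(200, 400, 800, 1600), ~ summarise_radius(row, .x)))
})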

asked by Chris
1 Answer

Based on @Jon Spring's comment, I used this answer as inspiration to speed up the process. The following now takes around 12 hours to run on my real data, as opposed to the days or weeks the code in my question would have taken. It is a bit memory-hungry at points (I am using a Windows desktop with 32GB of RAM), which is why I have minimised what gets stored, i.e. why I write each iteration's output to CSV and bring them all back in at the end. Using data.table has definitely sped things up, but I am sure there is still a quicker/more efficient way to do this; for now it does what I need, even if my code is messy!

# geosphere v1.5-19 required
install.packages('geosphere', repos='https://rspatial.r-universe.dev')

# Required libraries
library(data.table)
library(dplyr)
library(geosphere)
library(tidyr)
library(readr)
library(purrr)
   
set.seed(123)
   
# Input data
Data <- data.table(
    X=runif(5000000, min=200000, max=400000),
    Y=runif(5000000, min=400000, max=500000),
    UID = 1:5000000,
    Count = sample(1:2000,5000000,replace=TRUE))

# Add 50m grid cell ID for each point
Data[, ID := OSGB(Data[,c("X", "Y")], "50m")]   
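# (OSGB() returns Ordnance Survey grid references at the requested resolution,
# e.g. "SC550650NE" for a 50m cell, as seen in the output table at the end)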

# Summarise number of records for each 50m grid cell
Data_Summary <- Data[, .(Count = .N), by=.(ID)]
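# (note: this Count is the number of points per 50m cell; from here on the answer
# works at cell level, so the input Count column from 'Data' is not carried forward)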

# Generate centroid coordinates for 50 metre grid squares that cover the 'Data' extent
Coordinates <- CJ(X = seq((floor(min(Data$X)/1000)*1000)+25, ceiling(max(Data$X)/1000)*1000, 50),
                Y = seq((floor(min(Data$Y)/1000)*1000)+25, ceiling(max(Data$Y)/1000)*1000, 50))
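As a quick sanity check on the centroid arithmetic (the minimum below is illustrative, not drawn from the data): the sequence starts on a km-anchored lattice of 50m cell centres at or below the data's minimum, steps by 50m, and so passes through the centre of every 50m cell in the extent.

x_min <- 200703.7                                  # illustrative minimum X
first_centre <- (floor(x_min / 1000) * 1000) + 25  # 200025: first centre on the lattice
centres <- seq(first_centre, 201000, 50)           # 200025, 200075, ..., 200975
200725 %in% centres                                # TRUE: centre of the 50m cell holding x_min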

# Add 50m grid cell ID for each centroid
Coordinates[, ID := OSGB(Coordinates[,c("X", "Y")], "50m")]

# Join 50m grid cell centroid coordinates to 'Data_Summary'
Data_Summary_with_Centroids <- Coordinates[Data_Summary, on = .(ID), nomatch = NULL]    

# Define maximum buffer size
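# (presumably the largest search radius of 1600m plus one 50m cell of slack)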
Buffer <- 1650

# Generate offset XY coordinates for each grid centroid
Data_Summary_with_Centroids[,`:=`(X_NW = X-Buffer, Y_NW = Y+Buffer, 
                        X_NE = X+Buffer, Y_NE = Y+Buffer,
                        X_SW = X-Buffer, Y_SW = Y-Buffer,
                        X_SE = X+Buffer, Y_SE = Y-Buffer)]

# Assign 5km grid cell ID to all pairs of coordinates. Since the corners sit
# 1650m out, any 5km tile containing a point within the largest search radius
# of this centroid must contain either the centroid itself or one of its four
# corners, so matching on any of the five hashes captures all potential neighbours.
Data_Summary_with_Centroids[,`:=`(HashCN = OSGB(Data_Summary_with_Centroids[,c("X", "Y")], "5km"), 
                        HashNW = OSGB(Data_Summary_with_Centroids[,c("X_NW", "Y_NW")], "5km"),
                        HashNE = OSGB(Data_Summary_with_Centroids[,c("X_NE", "Y_NE")], "5km"),
                        HashSW = OSGB(Data_Summary_with_Centroids[,c("X_SW", "Y_SW")], "5km"),
                        HashSE = OSGB(Data_Summary_with_Centroids[,c("X_SE", "Y_SE")], "5km"))]

# Remove offset XY coordinates
Data_Summary_with_Centroids[, c("X_NW", "Y_NW","X_NE", "Y_NE","X_SW", "Y_SW","X_SE", "Y_SE"):=NULL] 

# List of all unique 5km grid cell IDs
Unique_5km_Hash <- unique(Data_Summary_with_Centroids$HashCN)

XY_Columns <- c("X", "Y")

for (n in seq_along(Unique_5km_Hash)) {
    
    # Subset so only grid cells that have this loop's 5km grid cell ID in any of
    # HashCN, HashNW, HashNE, HashSW or HashSE remain
    Hash <- Data_Summary_with_Centroids[Data_Summary_with_Centroids[, Reduce(`|`, lapply(.SD, `==`, Unique_5km_Hash[n])),
        .SDcols = c("HashCN", "HashNW", "HashNE", "HashSW", "HashSE")]][, c("HashNW", "HashNE", "HashSW", "HashSE") := NULL]
    
    # Select only the columns needed for 'crossing'
    Hash_Cross <- Hash[, ..XY_Columns]

    # Expand to include all possible combinations of values
    suppressMessages(Hash_5km <- Hash_Cross %>% 
        crossing(., ., .name_repair = "unique"))
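    # (with .name_repair = "unique", crossing() names the duplicated columns
    # X...1/Y...2 for the 'origin' cell and X...3/Y...4 for the 'other' cell;
    # the mutate() and join_by() calls below rely on those names)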
        
    rm(Hash_Cross)
    # Clear memory
    invisible(gc())
    
    # Calculate Euclidean distances, discard those over 1600m, keep only 'origin' records that are in 'HashCN', and add Count
    Hash_5km <- Hash_5km %>%
        mutate(Distance = sqrt((X...1 - X...3)^2 + (Y...2 - Y...4)^2)) %>%
        filter(Distance <=1600) %>%
        left_join(.,Hash %>% select(X, Y, ID, HashCN), join_by(X...1 == X, Y...2 == Y)) %>%
        filter(HashCN == Unique_5km_Hash[n]) %>%
        left_join(.,Hash %>% select(X, Y, Count), join_by(X...3 == X, Y...4 == Y)) %>%
        select(ID, Count, Distance)

    # Clear memory
    invisible(gc())

    # Create summary statistics of other grid cells within 200m
    Hash_200 <- Hash_5km %>%
        filter(Distance <=200) %>%
        group_by(ID) %>%
        summarise(Instances_200 = n(), Count_200 = sum(Count), Calculation_200 = Count_200/(Instances_200*0.25))
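    # (the 0.25 divisor is presumably the area of one 50m grid cell in hectares,
    # 2500 m^2 = 0.25 ha, which would make Calculation_* a per-hectare density)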

    # Create summary statistics of other grid cells within 400m
    Hash_400 <- Hash_5km %>%
        filter(Distance <=400) %>%
        group_by(ID) %>%
        summarise(Instances_400 = n(), Count_400 = sum(Count), Calculation_400 = Count_400/(Instances_400*0.25))

    # Create summary statistics of other grid cells within 800m 
    Hash_800 <- Hash_5km %>%
        filter(Distance <=800) %>%
        group_by(ID) %>%
        summarise(Instances_800 = n(), Count_800 = sum(Count), Calculation_800 = Count_800/(Instances_800*0.25))

    # Create summary statistics of other grid cells within 1600m    
    Hash_1600 <- Hash_5km %>%
        group_by(ID) %>%
        summarise(Instances_1600 = n(), Count_1600 = sum(Count), Calculation_1600 = Count_1600/(Instances_1600*0.25))

    # Join summary statistics together
    Output <- Hash_200 %>%
        left_join(Hash_400, by='ID') %>%
        left_join(Hash_800, by='ID') %>%
        left_join(Hash_1600, by='ID') 

    # Export output
    write_csv(Output, paste0("C:/Temp/Hash/",Unique_5km_Hash[n],".csv"))

    rm(Hash_5km)
    rm(Hash)
    rm(Hash_200)
    rm(Hash_400)
    rm(Hash_800)
    rm(Hash_1600)
    rm(Output)
    invisible(gc())
}

# Read all CSV files back into a single data frame
Grid_Calculations <- list.files(path = "C:/Temp/Hash/", pattern = "\\.csv$", full.names = TRUE) %>%
  map_dfr(read_csv, show_col_types = FALSE)
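Since data.table is already loaded, the read-back step could equally use fread() and rbindlist(), which tend to be faster than readr over many small files; this is an optional swap, not part of the original approach:

Files <- list.files(path = "C:/Temp/Hash/", pattern = "\\.csv$", full.names = TRUE)
Grid_Calculations <- rbindlist(lapply(Files, fread))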

EDIT: I have added set.seed() and slightly changed the start of the loop so that only the relevant columns are used with 'crossing' (this makes it memory-efficient for me).

Output

First five rows of first loop:

ID Instances_200 Count_200 Calculation_200 Instances_400 Count_400 Calculation_400 Instances_800 Count_800 Calculation_800 Instances_1600 Count_1600 Calculation_1600
SC550650NE 20 24 4.8 88 119 5.4090909090909 360 475 5.27777777777777 1503 2041 5.43180306054557
SC550650SW 22 28 5.09090909090909 87 119 5.47126436781609 360 475 5.27777777777777 1500 2049 5.464
SC550651NE 20 30 6 86 117 5.44186046511627 364 479 5.26373626373626 1494 2032 5.44042838018741
SC550651SE 21 30 5.71428571428571 85 113 5.31764705882352 364 479 5.26373626373626 1491 2037 5.46478873239436
SC550652NE 28 42 6 85 113 5.31764705882352 369 485 5.25745257452574 1487 2020 5.43375924680564
answered by Chris