
For Loop alternative to dynamically filter and summarise millions of points

Tags: r, filter, dplyr

I have a dataset with 5 million records in this format:

library(dplyr)

Data <- tibble(X = runif(5000000, min = 200000, max = 400000),
               Y = runif(5000000, min = 400000, max = 500000),
               UID = 1:5000000,
               Count = runif(5000000, min = 1, max = 2000))

For each UID, I need to summarise (1) the number of other points, (2) the sum of Count across those points, and (3) a density calculation, for points that are within 200m, 400m, 800m and 1600m. The final output required is a table with each UID and its summary output.

This for loop illustrates the output required, but would take weeks to complete:

Summary <- NULL

for (n in 1:nrow(Data)){
  Data_Row <- Data[n,]

  Row_200 <- Data %>%
    filter(between(X, Data_Row$X-200, Data_Row$X+200) & between(Y, Data_Row$Y-200, Data_Row$Y+200)) %>%
    summarise(Instances_200 = n(), Count_200 = sum(Count), Calculation_200 = Count_200/(Instances_200*0.000625))

  Row_400 <- Data %>%
    filter(between(X, Data_Row$X-400, Data_Row$X+400) & between(Y, Data_Row$Y-400, Data_Row$Y+400)) %>%
    summarise(Instances_400 = n(), Count_400 = sum(Count), Calculation_400 = Count_400/(Instances_400*0.000625))

  Row_800 <- Data %>%
    filter(between(X, Data_Row$X-800, Data_Row$X+800) & between(Y, Data_Row$Y-800, Data_Row$Y+800)) %>%
    summarise(Instances_800 = n(), Count_800 = sum(Count), Calculation_800 = Count_800/(Instances_800*0.000625))

  Row_1600 <- Data %>%
    filter(between(X, Data_Row$X-1600, Data_Row$X+1600) & between(Y, Data_Row$Y-1600, Data_Row$Y+1600)) %>%
    summarise(Instances_1600 = n(), Count_1600 = sum(Count), Calculation_1600 = Count_1600/(Instances_1600*0.000625))

  Summary_Row <- bind_cols(Data[n, "UID"], Row_200, Row_400, Row_800, Row_1600)

  Summary <- bind_rows(Summary, Summary_Row)
}

I am aware of things like lapply and map, but I don't know how they could be used to do this dynamic filtering and summarisation more quickly.
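For reference, the same row-by-row computation could be phrased with purrr, but that alone would not help: every row still filters the full 5-million-row table, so the work stays quadratic. A rough sketch of the functional version (the summarise_radius helper is hypothetical, not an established function):

library(purrr)

# Hypothetical helper: summary statistics for one radius around one point
summarise_radius <- function(row, r) {
  Data %>%
    filter(between(X, row$X - r, row$X + r) & between(Y, row$Y - r, row$Y + r)) %>%
    summarise("Instances_{r}" := n(),
              "Count_{r}" := sum(Count),
              "Calculation_{r}" := sum(Count) / (n() * 0.000625))
}

# Same output as the loop above, and roughly the same runtime
Summary <- map_dfr(1:nrow(Data), function(n) {
  row <- Data[n, ]
  bind_cols(row["UID"], map(c(200, 400, 800, 1600), ~ summarise_radius(row, .x)))
})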

asked by Chris
1 Answer

Based on @Jon Spring's comment, I used this answer as inspiration to speed up the process. The following now takes around 12 hours to run on my real data, as opposed to the days or weeks the code in my question would have taken. It is a bit memory-hungry at points (I am using a Windows desktop with 32GB of RAM), which is why I have minimised what gets stored, i.e. why I write each iteration's output to CSV and bring them all back in at the end. Using data.table has definitely sped things up, but I am sure there is still a quicker/more efficient way to do this; for now it does what I need, even if my code is messy!

# geosphere v1.5-19 required
install.packages('geosphere', repos='https://rspatial.r-universe.dev')

# Required libraries
library(data.table)
library(dplyr)
library(geosphere)
library(tidyr)
library(readr)
library(purrr)
   
set.seed(123)
   
# Input data
Data <- data.table(
    X=runif(5000000, min=200000, max=400000),
    Y=runif(5000000, min=400000, max=500000),
    UID = 1:5000000,
    Count = sample(1:2000,5000000,replace=TRUE))

# Add 50m grid cell ID for each point
Data[, ID := OSGB(Data[,c("X", "Y")], "50m")]   
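# (OSGB() returns Ordnance Survey grid references at the requested resolution,
# e.g. "SC550650NE" for a 50m cell, as seen in the output table at the end)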

# Summarise number of records for each 50m grid cell
Data_Summary <- Data[, .(Count = .N), by=.(ID)]
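# (note: this Count is the number of points per 50m cell; from here on the answer
# works at cell level, so the input Count column from 'Data' is not carried forward)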

# Generate centroid coordinates for 50 metre grid squares that cover the 'Data' extent
Coordinates <- CJ(X = seq((floor(min(Data$X)/1000)*1000)+25, ceiling(max(Data$X)/1000)*1000, 50),
                Y = seq((floor(min(Data$Y)/1000)*1000)+25, ceiling(max(Data$Y)/1000)*1000, 50))
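As a quick sanity check on the centroid arithmetic (the minimum below is illustrative, not drawn from the data): the sequence starts on a km-anchored lattice of 50m cell centres at or below the data's minimum, steps by 50m, and so passes through the centre of every 50m cell in the extent.

x_min <- 200703.7                                  # illustrative minimum X
first_centre <- (floor(x_min / 1000) * 1000) + 25  # 200025: first centre on the lattice
centres <- seq(first_centre, 201000, 50)           # 200025, 200075, ..., 200975
200725 %in% centres                                # TRUE: centre of the 50m cell holding x_min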

# Add 50m grid cell ID for each centroid
Coordinates[, ID := OSGB(Coordinates[,c("X", "Y")], "50m")]

# Join 50m grid cell centroid coordinates to 'Data_Summary'
Data_Summary_with_Centroids <- Coordinates[Data_Summary, on = .(ID), nomatch = NULL]    

# Define maximum buffer size
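# (presumably the largest search radius of 1600m plus one 50m cell of slack)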
Buffer <- 1650

# Generate offset XY coordinates for each grid centroid
Data_Summary_with_Centroids[,`:=`(X_NW = X-Buffer, Y_NW = Y+Buffer, 
                        X_NE = X+Buffer, Y_NE = Y+Buffer,
                        X_SW = X-Buffer, Y_SW = Y-Buffer,
                        X_SE = X+Buffer, Y_SE = Y-Buffer)]

# Assign 5km grid cell ID to all pairs of coordinates. Since the corners sit
# 1650m out, any 5km tile containing a point within the largest search radius
# of this centroid must contain either the centroid itself or one of its four
# corners, so matching on any of the five hashes captures all potential neighbours.
Data_Summary_with_Centroids[,`:=`(HashCN = OSGB(Data_Summary_with_Centroids[,c("X", "Y")], "5km"), 
                        HashNW = OSGB(Data_Summary_with_Centroids[,c("X_NW", "Y_NW")], "5km"),
                        HashNE = OSGB(Data_Summary_with_Centroids[,c("X_NE", "Y_NE")], "5km"),
                        HashSW = OSGB(Data_Summary_with_Centroids[,c("X_SW", "Y_SW")], "5km"),
                        HashSE = OSGB(Data_Summary_with_Centroids[,c("X_SE", "Y_SE")], "5km"))]

# Remove offset XY coordinates
Data_Summary_with_Centroids[, c("X_NW", "Y_NW","X_NE", "Y_NE","X_SW", "Y_SW","X_SE", "Y_SE"):=NULL] 

# List of all unique 5km grid cell IDs
Unique_5km_Hash <- unique(Data_Summary_with_Centroids$HashCN)

XY_Columns <- c("X", "Y")

for (n in seq_along(Unique_5km_Hash)) {
    
    # Subset so only grid cells that have this loop's 5km grid cell ID in any of
    # HashCN, HashNW, HashNE, HashSW or HashSE remain
    Hash <- Data_Summary_with_Centroids[Data_Summary_with_Centroids[, Reduce(`|`, lapply(.SD, `==`, Unique_5km_Hash[n])),
        .SDcols = c("HashCN", "HashNW", "HashNE", "HashSW", "HashSE")]][, c("HashNW", "HashNE", "HashSW", "HashSE") := NULL]
    
    # Select only the columns needed for 'crossing'
    Hash_Cross <- Hash[, ..XY_Columns]

    # Expand to include all possible combinations of values
    suppressMessages(Hash_5km <- Hash_Cross %>% 
        crossing(., ., .name_repair = "unique"))
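    # (with .name_repair = "unique", crossing() names the duplicated columns
    # X...1/Y...2 for the 'origin' cell and X...3/Y...4 for the 'other' cell;
    # the mutate() and join_by() calls below rely on those names)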
        
    rm(Hash_Cross)
    # Clear memory
    invisible(gc())
    
    # Calculate Euclidean distances, discard those over 1600m, keep only 'origin' records that are in 'HashCN', and add Count
    Hash_5km <- Hash_5km %>%
        mutate(Distance = sqrt((X...1 - X...3)^2 + (Y...2 - Y...4)^2)) %>%
        filter(Distance <=1600) %>%
        left_join(.,Hash %>% select(X, Y, ID, HashCN), join_by(X...1 == X, Y...2 == Y)) %>%
        filter(HashCN == Unique_5km_Hash[n]) %>%
        left_join(.,Hash %>% select(X, Y, Count), join_by(X...3 == X, Y...4 == Y)) %>%
        select(ID, Count, Distance)

    # Clear memory
    invisible(gc())

    # Create summary statistics of other grid cells within 200m
    Hash_200 <- Hash_5km %>%
        filter(Distance <=200) %>%
        group_by(ID) %>%
        summarise(Instances_200 = n(), Count_200 = sum(Count), Calculation_200 = Count_200/(Instances_200*0.25))
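    # (the 0.25 divisor is presumably the area of one 50m grid cell in hectares,
    # 2500 m^2 = 0.25 ha, which would make Calculation_* a per-hectare density)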

    # Create summary statistics of other grid cells within 400m
    Hash_400 <- Hash_5km %>%
        filter(Distance <=400) %>%
        group_by(ID) %>%
        summarise(Instances_400 = n(), Count_400 = sum(Count), Calculation_400 = Count_400/(Instances_400*0.25))

    # Create summary statistics of other grid cells within 800m 
    Hash_800 <- Hash_5km %>%
        filter(Distance <=800) %>%
        group_by(ID) %>%
        summarise(Instances_800 = n(), Count_800 = sum(Count), Calculation_800 = Count_800/(Instances_800*0.25))

    # Create summary statistics of other grid cells within 1600m    
    Hash_1600 <- Hash_5km %>%
        group_by(ID) %>%
        summarise(Instances_1600 = n(), Count_1600 = sum(Count), Calculation_1600 = Count_1600/(Instances_1600*0.25))

    # Join summary statistics together
    Output <- Hash_200 %>%
        left_join(Hash_400, by='ID') %>%
        left_join(Hash_800, by='ID') %>%
        left_join(Hash_1600, by='ID') 

    # Export output
    write_csv(Output, paste0("C:/Temp/Hash/",Unique_5km_Hash[n],".csv"))

    rm(Hash_5km)
    rm(Hash)
    rm(Hash_200)
    rm(Hash_400)
    rm(Hash_800)
    rm(Hash_1600)
    rm(Output)
    invisible(gc())
}

# Read all CSV files back into a single data frame
Grid_Calculations <- list.files(path = "C:/Temp/Hash/", pattern = "\\.csv$", full.names = TRUE) %>%
  map_dfr(read_csv, show_col_types = FALSE)
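Since data.table is already loaded, the read-back step could equally use fread() and rbindlist(), which tend to be faster than readr over many small files; this is an optional swap, not part of the original approach:

Files <- list.files(path = "C:/Temp/Hash/", pattern = "\\.csv$", full.names = TRUE)
Grid_Calculations <- rbindlist(lapply(Files, fread))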

EDIT: I have added set.seed() and slightly changed the start of the loop so that only the relevant columns are used with 'crossing' (this makes it memory-efficient for me).

Output

First five rows of first loop:

ID Instances_200 Count_200 Calculation_200 Instances_400 Count_400 Calculation_400 Instances_800 Count_800 Calculation_800 Instances_1600 Count_1600 Calculation_1600
SC550650NE 20 24 4.8 88 119 5.4090909090909 360 475 5.27777777777777 1503 2041 5.43180306054557
SC550650SW 22 28 5.09090909090909 87 119 5.47126436781609 360 475 5.27777777777777 1500 2049 5.464
SC550651NE 20 30 6 86 117 5.44186046511627 364 479 5.26373626373626 1494 2032 5.44042838018741
SC550651SE 21 30 5.71428571428571 85 113 5.31764705882352 364 479 5.26373626373626 1491 2037 5.46478873239436
SC550652NE 28 42 6 85 113 5.31764705882352 369 485 5.25745257452574 1487 2020 5.43375924680564
answered by Chris