Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Finding Out 5 Closest Points To Each Point

Suppose I have the following two data frames:

set.seed(123)

df_1 <- data.frame(
  name_1 = c("john", "david", "alex", "kevin", "trevor", "xavier", "tom", "michael", "troy", "kelly", "chris", "henry", "taylor", "ryan", "peter"),
  lon = rnorm(15, mean = -74.0060, sd = 0.01),
  lat = rnorm(15, mean = 40.7128, sd = 0.01)
)

df_2 <- data.frame(
  name_2 = c("matthew", "tyler", "sebastian", "julie", "anna", "tim", "david", "nigel", "sarah", "steph", "sylvia", "boris", "theo", "malcolm"),
  lon = rnorm(14, mean = -74.0060, sd = 0.01),
  lat = rnorm(14, mean = 40.7128, sd = 0.01)
)

My Problem: For each person in df_1, I am trying to find out the 5 closest people (haversine distance) to this person in df_2, and record various distance statistics (e.g. mean, median, max, min standard deviation).

Attempt

First, I defined the distance function:

library(geosphere)
haversine_distance <- function(lon1, lat1, lon2, lat2) {
  distHaversine(c(lon1, lat1), c(lon2, lat2))
}

Then, I calculated the distance between each person in df_1 and all people in df_2:

# Create a matrix to store results
distances <- matrix(nrow = nrow(df_1), ncol = nrow(df_2))

# calculate the distances
for (i in 1:nrow(df_1)) {
    for (j in 1:nrow(df_2)) {
        distances[i, j] <- haversine_distance(df_1$lon[i], df_1$lat[i], df_2$lon[j], df_2$lat[j])
    }
}

# Create final
final <- data.frame(
    name_1 = rep(df_1$name_1, each = nrow(df_2)),
    lon_1 = rep(df_1$lon, each = nrow(df_2)),
    lat_1 = rep(df_1$lat, each = nrow(df_2)),
    name_2 = rep(df_2$name_2, nrow(df_1)),
    lon_2 = rep(df_2$lon, nrow(df_1)),
    lat_2 = rep(df_2$lat, nrow(df_1)),
    distance = c(distances)
)

Finally, for each person in df_1, I kept the 5 minimum distances and recorded the distance statistics:

# Keep only first 5 rows for each unique value of final$name_1
final <- final[order(final$name_1, final$distance), ]
final <- final[ave(final$distance, final$name_1, FUN = seq_along) <= 5, ]


# Calculate summary statistics for each unique person in final$name_1
final_summary <- aggregate(distance ~ name_1,
                           data = final,
                           FUN = function(x) c(min = min(x),
                                               max = max(x),
                                               mean = mean(x),
                                               median = median(x),
                                               sd = sd(x)))
final_summary <- do.call(data.frame, final_summary)
names(final_summary)[-(1)] <- c("min_distance", "max_distance", "mean_distance", "median_distance", "sd_distance")


final_summary$closest_people <- tapply(final$name_2,
                                       final$name_1,
                                       FUN = function(x) paste(sort(x), collapse = ", "))


# break closest_people column into multiple columns
n <- 5
closest_people_split <- strsplit(final_summary$closest_people, ", ")
final_summary[paste0("closest_", seq_len(n))] <- do.call(rbind, closest_people_split)

The final result look like this:

  name_1 min_distance max_distance mean_distance median_distance sd_distance                          closest_people closest_1 closest_2 closest_3 closest_4 closest_5
1   alex     342.8375    1158.1408      717.0810        650.9167    358.7439     boris, david, matthew, nigel, sarah     boris     david   matthew     nigel     sarah
2  chris     195.4891    1504.8199      934.6618        895.8301    489.5175     boris, david, malcolm, nigel, steph     boris     david   malcolm     nigel     steph
3  david     549.4500     830.2758      716.3839        807.6626    143.9571      matthew, sarah, steph, sylvia, tim   matthew     sarah     steph    sylvia       tim
4  henry     423.1875     975.1733      639.5657        560.1101    223.2389    anna, boris, matthew, sebastian, tim      anna     boris   matthew sebastian       tim
5   john     415.8956    1174.1631      849.4313        965.2928    313.2616      boris, julie, matthew, theo, tyler     boris     julie   matthew      theo     tyler
6  kelly     489.7949     828.5550      657.5908        658.7015    120.6485 david, julie, matthew, sebastian, steph     david     julie   matthew sebastian     steph

My Question: Although this code seems to run without errors, I have the feeling that this code will start to take a long time to run when the sizes of df_1 and df_2 start to grow. Hence, I am looking for ways to improve the efficiency of this code. Can someone please suggest routines for large data frames?

like image 746
stats_noob Avatar asked Oct 28 '25 13:10

stats_noob


2 Answers

A data.table approach to this problem might be as follows:

funcs <- function(d,n) {
  c(setNames(lapply(c(min,max,mean,median,sd), \(f) f(d)), c("min", "max", "mean", "median", "sd")),
    list("names" = paste0(n, collapse=", "))
  )
}

library(data.table)

setDT(cross_join(df_1, df_2))[
  ,dist:=distHaversine(c(lon.x, lat.x), c(lon.y, lat.y)), .(name_1, name_2)
][order(dist), .SD[1:5, funcs(dist, name_2)], name_1]

Output:

     name_1       min       max      mean    median        sd                                  names
 1:  taylor  170.5171  746.6206  470.0857  439.8022 227.39141    david, tim, nigel, sarah, sebastian
 2:   peter  195.4891 1455.0204  834.2543  830.2758 539.69009     steph, boris, matthew, anna, david
 3:     tom  243.6729  530.4778  426.2490  447.8639 110.26649    tim, sebastian, julie, nigel, david
 4:    ryan  342.8375 1243.7473  970.0721 1052.6759 367.08513 tyler, julie, sebastian, sylvia, nigel
 5:   henry  394.8684  894.5358  647.1996  670.9220 236.69562     anna, matthew, david, steph, boris
 6:    john  423.1875 1948.9521 1106.4374 1052.8789 674.69139     boris, steph, matthew, anna, david
 7:   kelly  491.6430 1130.9239  717.7716  658.7015 248.96974     sylvia, tyler, sarah, nigel, julie
 8:  trevor  520.1834  650.9167  609.4363  631.9494  52.96026    nigel, sarah, julie, tim, sebastian
 9:    troy  549.4500 1035.0599  782.8799  828.5550 220.72034      tyler, sylvia, sarah, nigel, theo
10: michael  581.9209 1504.5642 1057.1773 1012.5247 378.81712      theo, tyler, sylvia, sarah, nigel
11:   david  602.9369  941.3102  752.1558  715.3872 159.37550      nigel, sarah, david, sylvia, anna
12:   kevin  638.9259  834.5504  715.5252  644.2898 102.23793     matthew, anna, david, nigel, steph
13:  xavier  972.9730 1767.1953 1369.5604 1396.8569 371.03190    julie, sebastian, tim, tyler, david
14:   chris 1389.1659 2106.7084 1644.0448 1455.8430 316.31565     julie, tyler, sebastian, tim, theo
15:    alex 1765.7750 2428.5429 2013.7843 1828.6055 294.37805     julie, tyler, sebastian, tim, theo

Another approach using dplyr is to use cross_join with rowwise() to get the distances, followed by slice_head(n=5, by=name_1) to get the five minimum distance by name_1, and then reframe or summarize the usual way:

cross_join(df_1, df_2) %>%
  rowwise() %>% 
  mutate(dist=distHaversine(c(lon.x, lat.x), c(lon.y, lat.y))) %>% 
  ungroup() %>% 
  arrange(dist) %>%
  slice_head(n = 5, by=name_1) %>% 
  reframe(
    min_distance = min(dist),
    max_distance = max(dist),
    mean_distance=mean(dist),
    median_distance=median(dist),
    sd_distance = sd(dist),
    names = paste0(name_2, collapse=","),
    .by=name_1
  )

Output:

# A tibble: 15 × 7
   name_1  min_distance max_distance mean_distance median_distance sd_distance names                             
   <chr>          <dbl>        <dbl>         <dbl>           <dbl>       <dbl> <chr>                             
 1 taylor          171.         747.          470.            440.       227.  david,tim,nigel,sarah,sebastian   
 2 peter           195.        1455.          834.            830.       540.  steph,boris,matthew,anna,david    
 3 tom             244.         530.          426.            448.       110.  tim,sebastian,julie,nigel,david   
 4 ryan            343.        1244.          970.           1053.       367.  tyler,julie,sebastian,sylvia,nigel
 5 henry           395.         895.          647.            671.       237.  anna,matthew,david,steph,boris    
 6 john            423.        1949.         1106.           1053.       675.  boris,steph,matthew,anna,david    
 7 kelly           492.        1131.          718.            659.       249.  sylvia,tyler,sarah,nigel,julie    
 8 trevor          520.         651.          609.            632.        53.0 nigel,sarah,julie,tim,sebastian   
 9 troy            549.        1035.          783.            829.       221.  tyler,sylvia,sarah,nigel,theo     
10 michael         582.        1505.         1057.           1013.       379.  theo,tyler,sylvia,sarah,nigel     
11 david           603.         941.          752.            715.       159.  nigel,sarah,david,sylvia,anna     
12 kevin           639.         835.          716.            644.       102.  matthew,anna,david,nigel,steph    
13 xavier          973.        1767.         1370.           1397.       371.  julie,sebastian,tim,tyler,david   
14 chris          1389.        2107.         1644.           1456.       316.  julie,tyler,sebastian,tim,theo    
15 alex           1766.        2429.         2014.           1829.       294.  julie,tyler,sebastian,tim,theo   
like image 150
langtang Avatar answered Oct 30 '25 05:10

langtang


Work in progress

This solution is neither more concise nor faster than the one given by user @langtang, but draws attention to geosphere:.distm() as well as {Rfast}, {psych}, and {collpase}.

(1) Calculate matrix of distances (MD)

MD = geosphere::distm(df_1[-1], df_2[-1], fun = geosphere::distHaversine) 

(2) For each row (points from df_1), find first five nearest points (stored in matrix X)

rowMins = \(D, k) matrix(D[order(row(D), D)], ncol = ncol(D), byrow = TRUE)[, k]
X = rowMins(MD, 1:5)

(3) Calculate summary characteristics (S) row-wisely

Namely min, max, mean, median, and sd.

Although consired to be relatively slow, the trick is to transpose X first, such that we can use well-established functions. Unfortunately, summary(t(X)) lacks sd while collapse::qsut(t(X)) lacks median. Comment please if there are options.

S = cbind(collapse::qsu(t(X))[, -1], Median = Rfast::rowMedians(X))

This already has created overhead, since qsu() and rowMedians() both ran over each row. An alternative might be

psych::describe(t(X), skew = FALSE)[3:7] |> # S2
  `row.names<-`(df_1$name_1) 

giving

           mean     sd  median     min     max
john    1877.49 526.79 2086.03  965.66 2241.15
david    763.62 160.63  831.82  562.65  910.51
alex    1518.81 192.12 1561.73 1225.09 1721.88
kevin    892.37 290.81  922.51  582.00 1236.57
trevor   623.79 226.16  592.48  359.11  857.30
xavier   741.49 130.98  677.02  621.83  932.36
tom      530.70 189.60  597.44  205.96  663.98
michael 1109.90 146.69 1097.67  893.32 1295.68
troy     861.05 188.89  801.55  616.48 1059.15
kelly    802.43 291.93  800.38  432.64 1118.09
chris   1184.69 233.42 1233.05  840.16 1457.28
henry    963.14 257.45  994.43  649.05 1337.23
taylor   594.71 386.41  757.00  118.70 1003.30
ryan     720.59 217.00  772.56  407.34  957.74
peter   1333.79 552.69 1509.01  374.18 1718.11

There might be an option to specify which summary statistics should be calculated. I did not read the full documentation.

However, all this does not really help since you want the names associated with the five closest points too. A lot of overhead happens here.

X2 = t(apply(MD, 1, \(i) names(sort(i)[1:5])))
# collapse::dapply(MD, \(i) names(sort(i)[1:5])), 1) does not work 

Side note. It appears to me there is no better base R solution than:

f = \(X, k) t(apply(X, 1, \(i) names(sort(i)[1:k])))

Finally giving

> cbind(data.frame(S), data.frame(X2))
             Mean       SD       Min       Max    Median        X1        X2        X3        X4        X5
john    1879.5928 527.3832  966.7430 2243.6565 2088.3664     steph     tyler   malcolm     boris       tim
david    764.4733 160.8068  563.2785  911.5332  832.7494     steph     tyler sebastian       tim   malcolm
alex    1520.5110 192.3321 1226.4584 1723.8043 1563.4727      anna   matthew      theo    sylvia     david
kevin    893.3704 291.1345  582.6504 1237.9577  923.5420     tyler     steph       tim sebastian     boris
trevor   624.4905 226.4107  359.5081  858.2593  593.1462     david sebastian       tim     tyler   matthew
xavier   742.3244 131.1247  622.5262  933.4064  677.7796      anna   matthew    sylvia     sarah     julie
tom      531.2896 189.8111  206.1902  664.7259  598.1114       tim     tyler     julie     david sebastian
michael 1111.1394 146.8538  894.3224 1297.1271 1098.9009     nigel      theo   malcolm sebastian     david
troy     862.0110 189.0964  617.1660 1060.3376  802.4499 sebastian     david   malcolm     nigel      theo
kelly    803.3236 292.2534  433.1232 1119.3422  801.2745 sebastian     david   malcolm       tim      theo
chris   1186.0200 233.6779  841.1021 1458.9119 1234.4328      anna   matthew      theo     david     sarah
henry    964.2193 257.7372  649.7776 1338.7294  995.5426     tyler     steph       tim     boris sebastian
taylor   595.3773 386.8399  118.8290 1004.4187  757.8502     tyler       tim sebastian     julie     sarah
ryan     721.3972 217.2475  407.7949  958.8076  773.4229     david      theo      anna     nigel   matthew
peter   1335.2818 553.3045  374.5971 1720.0334 1510.6984     steph     tyler   malcolm sebastian       tim

Note

Data in reproducible format.

df_1 = structure(list(
  name_1 = c(
    "john",
    "david",
    "alex",
    "kevin",
    "trevor",
    "xavier",
    "tom",
    "michael",
    "troy",
    "kelly",
    "chris",
    "henry",
    "taylor",
    "ryan",
    "peter"
  ),
  lon = c(
    -74.0116047564655,
    -74.0083017748948,
    -73.9904129168585,
    -74.0052949160858,
    -74.0047071226484,
    -73.9888493501312,
    -74.0013908379401,
    -74.0186506123461,
    -74.0128685285189,
    -74.010456619701,
    -73.9937591820256,
    -74.0024018617294,
    -74.001992285494,
    -74.0048931728406,
    -74.0115584113475
  ),
  lat = c(
    40.730669131368,
    40.7177785047823,
    40.6931338284337,
    40.7198135590156,
    40.7080720859227,
    40.7021217629401,
    40.7106202508534,
    40.7025399555169,
    40.7055110877071,
    40.7065496073215,
    40.6959330668926,
    40.721177870445,
    40.7143337311784,
    40.7014186306299,
    40.7253381492107
  )
),
class = "data.frame",
row.names = c(NA, -15L))

df_2 = structure(list(
  name_2 = c(
    "matthew",
    "tyler",
    "sebastian",
    "julie",
    "anna",
    "tim",
    "david",
    "nigel",
    "sarah",
    "steph",
    "sylvia",
    "boris",
    "theo",
    "malcolm"
  ),
  lon = c(
    -73.9950316098685,
    -74.0016481850917,
    -74.0092593158553,
    -73.9945119238155,
    -73.9960649614404,
    -74.0005160304049,
    -74.0036126826489,
    -74.0122790607604,
    -73.9923934755147,
    -74.0120025958715,
    -73.9841266700698,
    -73.9906738937381,
    -74.008357003591,
    -74.0162642090031
  ),
  lat = c(
    40.705695934363,
    40.7153688370916,
    40.7103330812154,
    40.709324574006,
    40.7032838143273,
    40.7123497227519,
    40.7049509553054,
    40.6961205806341,
    40.7089977347971,
    40.7219899660906,
    40.7070465303739,
    40.7188796432223,
    40.6966211729171,
    40.7122443803448
  )
),
class = "data.frame",
row.names = c(NA, -14L))
like image 22
Friede Avatar answered Oct 30 '25 06:10

Friede



Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!