Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How to obtain normalized data per group using scale() in R?

For example, I have a data like below.

 dataA=structure(list(variety = c("CV1", "CV1", "CV1", "CV1", "CV1", 
                                     "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", 
                                     "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", 
                                     "CV1", "CV1", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", 
                                     "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", 
                                     "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", 
                                     "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", 
                                     "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", 
                                     "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV4", "CV4", 
                                     "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", 
                                     "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", 
                                     "CV4", "CV4", "CV4", "CV4", "CV4"), KN = c(3150, 2646, 3024, 
                                                                                3402, 3260.25, 3276, 3150, 3685.5, 3260.25, 3402, 4672, 4234, 
                                                                                4599, 4526, 4599, 4526, 4234, 4088, 3504, 3942, 3550, 3550, 3124, 
                                                                                4437.5, 3550, 4313.25, 4118, 4153.5, 4473, 3851.75, 4260, 4952.25, 
                                                                                5680, 5112, 5431.5, 3976, 5112, 4828, 4544, 5112, 4632.75, 5271.75, 
                                                                                3976, 4792.5, 4544, 4313.25, 4952.25, 4792.5, 4260, 4952.25, 
                                                                                3192.75, 2580, 2999.25, 2580, 3096, 2902.5, 2805.75, 2332.75, 
                                                                                2999.25, 2666, 3240, 3648, 3360, 3564, 3360, 3360, 3552, 3648, 
                                                                                3456, 4320, 3256, 3564, 2992, 3663, 3168, 3861, 3663, 3861, 3564, 
                                                                                3465, 2886, 3042, 2886, 2886, 3412.5, 2886, 3510, 3246.75, 3159, 
                                                                                3159, 3384, 3760, 4018.5, 3572, 3912.75, 4018.5, 4230, 3572, 
                                                                                3478, 3760), GY = c(9729.7744491255, 8562.20151523044, 9885.45084031151, 
                                                                                                    10741.6709918346, 9729.7744491255, 10118.9654270905, 9963.28903590451, 
                                                                                                    10975.1855786136, 10118.9654270905, 10041.1272314975, 12266.3112039261, 
                                                                                                    11454.5700213133, 12356.5046686608, 12627.0850628651, 13889.7935691516, 
                                                                                                    11995.7308097218, 11364.3765565786, 12085.9242744566, 11725.1504155176, 
                                                                                                    10552.6353739658, 10000.3548433298, 12368.8599378026, 10438.9668978618, 
                                                                                                    11316.1910069258, 10614.4117196746, 12632.0271705218, 13421.5288686794, 
                                                                                                    12456.582348709, 12368.8599378026, 12105.6927050834, 17281.3149485611, 
                                                                                                    17281.3149485611, 16404.090839497, 17719.9270030931, 18158.5390576251, 
                                                                                                    15877.7563740586, 19825.2648648467, 18333.9838794379, 17105.8701267483, 
                                                                                                    19123.4855775955, 14035.5857450242, 15351.4219086202, 14474.1977995562, 
                                                                                                    15702.3115522458, 15351.4219086202, 11930.2478832706, 15702.3115522458, 
                                                                                                    13860.1409232114, 13947.8633341178, 8859.96350154653, 8712.93579875975, 
                                                                                                    8075.40391104562, 8553.55282683122, 8394.16985490268, 9137.95705723583, 
                                                                                                    9403.59534378339, 8606.68048414073, 7969.1485964266, 8819.19111337877, 
                                                                                                    8606.68048414073, 11268.005457273, 13106.4695055649, 10437.7313709476, 
                                                                                                    10674.9525384691, 9844.67845214374, 10971.478997871, 11623.8372085553, 
                                                                                                    11327.3107491533, 10852.8684141103, 11149.3948735122, 11959.9005292108, 
                                                                                                    14134.4278981582, 11851.1741607634, 13101.5273979082, 12068.6268976581, 
                                                                                                    13373.3433190266, 12286.0796345529, 12394.8060030003, 12449.1691872239, 
                                                                                                    13427.7065032503, 11516.3463670221, 10745.3775725771, 11275.418618758, 
                                                                                                    11660.9030159805, 11805.4596649389, 11660.9030159805, 12287.3151614671, 
                                                                                                    12865.5417573008, 11564.5319166749, 12046.387413203, 14691.650536451, 
                                                                                                    14807.7900663834, 15504.6272459779, 14517.4412415523, 14923.9295963158, 
                                                                                                    14807.7900663834, 14923.9295963158, 13936.7435918902, 14459.3714765861, 
                                                                                                    14517.4412415523)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
                                                                                                                                                                                -100L))

Now, I'd like to normalize data per variety about KN and GY values respectively.

Simply, what I did is

CV1=subset(dataA, variety=="CV1")
CV2=subset(dataA, variety=="CV2")
CV3=subset(dataA, variety=="CV3")

CV1$CV1_KN=scale(CV1$KN, center=TRUE, scale=TRUE)
CV2$CV2_KN=scale(CV2$KN, center=TRUE, scale=TRUE)
CV3$CV3_KN=scale(CV3$KN, center=TRUE, scale=TRUE)
CV1$CV1_GY=scale(CV1$GY, center=TRUE, scale=TRUE)
CV2$CV2_GY=scale(CV2$GY, center=TRUE, scale=TRUE)
CV3$CV3_GY=scale(CV3$GY, center=TRUE, scale=TRUE)

Then I download each data and match it to each variety in an Excel file. But in my actual data, I have more than 30 varieties, and it's impossible to manually match. I believe there are simple ways to automatically obtain normalized data per group in R.

Could you let me know how to do it?

Always many thanks!!

like image 624
J.K Kim Avatar asked Feb 02 '26 21:02

J.K Kim


2 Answers

Here we will get a list of the different variety:

library(dplyr)

dataA_normalized <- dataA %>%
  group_by(variety) %>%
  mutate(KN_scaled = scale(KN, center = TRUE, scale = TRUE),
         GY_scaled = scale(GY, center = TRUE, scale = TRUE)) %>% 
  group_split()
<list_of<
  tbl_df<
    variety  : character
    KN       : double
    GY       : double
    KN_scaled: double[,1]
    GY_scaled: double[,1]
  >
>[4]>
[[1]]
# A tibble: 25 × 5
   variety    KN     GY KN_scaled[,1] GY_scaled[,1]
   <chr>   <dbl>  <dbl>         <dbl>         <dbl>
 1 CV1     3150   9730.       -0.980        -1.05  
 2 CV1     2646   8562.       -1.82         -2.01  
 3 CV1     3024   9885.       -1.19         -0.919 
 4 CV1     3402  10742.       -0.558        -0.211 
 5 CV1     3260.  9730.       -0.795        -1.05  
 6 CV1     3276  10119.       -0.769        -0.726 
 7 CV1     3150   9963.       -0.980        -0.854 
 8 CV1     3686. 10975.       -0.0839       -0.0180
 9 CV1     3260. 10119.       -0.795        -0.726 
10 CV1     3402  10041.       -0.558        -0.790 
# … with 15 more rows
# ℹ Use `print(n = ...)` to see more rows

[[2]]
# A tibble: 25 × 5
   variety    KN     GY KN_scaled[,1] GY_scaled[,1]
   <chr>   <dbl>  <dbl>         <dbl>         <dbl>
 1 CV2     4313. 12632.        -0.703        -0.962
 2 CV2     4118  13422.        -1.10         -0.663
 3 CV2     4154. 12457.        -1.03         -1.03 
 4 CV2     4473  12369.        -0.376        -1.06 
 5 CV2     3852. 12106.        -1.65         -1.16 
 6 CV2     4260  17281.        -0.813         0.798
 7 CV2     4952. 17281.         0.607         0.798
 8 CV2     5680  16404.         2.10          0.466
 9 CV2     5112  17720.         0.935         0.964
10 CV2     5432. 18159.         1.59          1.13 
# … with 15 more rows
# ℹ Use `print(n = ...)` to see more rows

[[3]]
# A tibble: 25 × 5
   variety    KN    GY KN_scaled[,1] GY_scaled[,1]
   <chr>   <dbl> <dbl>         <dbl>         <dbl>
 1 CV3     3193. 8713.       -0.0445        -0.978
 2 CV3     2580  8075.       -1.45          -1.34 
 3 CV3     2999. 8554.       -0.487         -1.07 
 4 CV3     2580  8394.       -1.45          -1.16 
 5 CV3     3096  9138.       -0.266         -0.736
 6 CV3     2902. 9404.       -0.708         -0.584
 7 CV3     2806. 8607.       -0.930         -1.04 
 8 CV3     2333. 7969.       -2.01          -1.40 
 9 CV3     2999. 8819.       -0.487         -0.918
10 CV3     2666  8607.       -1.25          -1.04 
# … with 15 more rows
# ℹ Use `print(n = ...)` to see more rows

[[4]]
# A tibble: 25 × 5
   variety    KN     GY KN_scaled[,1] GY_scaled[,1]
   <chr>   <dbl>  <dbl>         <dbl>         <dbl>
 1 CV4     3861  13373.        0.949          0.162
 2 CV4     3663  12286.        0.446         -0.587
 3 CV4     3861  12395.        0.949         -0.512
 4 CV4     3564  12449.        0.194         -0.475
 5 CV4     3465  13428.       -0.0578         0.200
 6 CV4     2886  11516.       -1.53          -1.12 
 7 CV4     3042  10745.       -1.13          -1.65 
 8 CV4     2886  11275.       -1.53          -1.28 
 9 CV4     2886  11661.       -1.53          -1.02 
10 CV4     3412. 11805.       -0.191         -0.919
# … with 15 more rows
# ℹ Use `print(n = ...)` to see more rows
like image 89
TarJae Avatar answered Feb 05 '26 12:02

TarJae


We may use across

library(dplyr)# version  >= 1.1.0
dataA %>% 
  mutate(across(c(KN, GY), ~ scale(.x, center = TRUE, scale = TRUE)[,1]
, .names = "{.col}_scaled"), .by = variety)
like image 27
akrun Avatar answered Feb 05 '26 11:02

akrun