如何从多个文本文件创建组合数据框并根据 R 中的文件名重命名列

穆罕默德·图菲克

我有多个 .txt 文件(每个文件包含 4 列;一个标识符基因列、一个 raw_counts 和其他列)。我想使用公共基因列将这些文件合并到一个组合数据框中。我能够将多个 .txt 文件一起导入,根据标识符列合并,但是,合并后,raw_counts 列变得无法识别(来自哪个 .txt 文件?)。是否可以将 .txt 文件部分的前缀/名称添加到此列并删除其余列(median_length_normalized 和 RPKM)。我提供了一个示例,可能有助于您提供输入。

File names: For instance
TARGET-00-BM3897-14A-01R.gene.quantification.txt
TARGET-00-BM3969-14A-01R.gene.quantification.txt

Add prefix names to the raw_counts column names as:
raw_counts_TARGET-00-BM3897-14A-01R
raw_counts_TARGET-00-BM3969-14A-01R

加载文件和合并:

library(tidyverse)
all_files <- dir("/Target_Data/TARGET_FHCRC/")
file_names <- grep(all_files,pattern = "^T.*gene.quantification.txt$",value = TRUE)
Data_file <- map(file_names,read.delim, stringsAsFactors = FALSE, check.names = FALSE, row.names = NULL)
str(Data_file)
Merge_All_Samples <- Data_file %>% reduce(inner_join, by = "gene")
colnames(Merge_All_Samples)
## There could be more for instance; ".x", ".x.x", ".x.x.x", , similary ".y", ".y.y", ".y.y.y"
names(Merge_All_Samples) = gsub(names(Merge_All_Samples),pattern = ".x",replacement = "")
names(Merge_All_Samples) = gsub(names(Merge_All_Samples),pattern = ".y",replacement = "")
colnames(Merge_All_Samples)

dput(Merge_All_Samples)

structure(list(gene = c("ENSG00000000457", "ENSG00000000460", 
                        "ENSG00000000938", "ENSG00000000971", "ENSG00000001460"), raw_counts = c(1359L, 
                                                                                                 1042L, 24113L, 296L, 351L), median_length_normalized = c(26.5127, 
                                                                                                                                                          12.2998, 513.195, 2.7302, 3.0886), RPKM = c(5.2943, 2.4561, 102.4798, 
                                                                                                                                                                                                      0.5452, 0.6167), raw_counts = c(1497L, 1152L, 20619L, 251L, 314L
                                                                                                                                                                                                      ), median_length_normalized = c(29.2034, 13.5943, 438.824, 2.3087, 
                                                                                                                                                                                                                                      2.7697), RPKM = c(6.9478, 3.2342, 104.4015, 0.5493, 0.6589)), class = "data.frame", row.names = c(NA, 
                                                                                                                                                                                                                                                                                                                                        -5L))
#>              gene raw_counts median_length_normalized     RPKM raw_counts
#> 1 ENSG00000000457       1359                  26.5127   5.2943       1497
#> 2 ENSG00000000460       1042                  12.2998   2.4561       1152
#> 3 ENSG00000000938      24113                 513.1950 102.4798      20619
#> 4 ENSG00000000971        296                   2.7302   0.5452        251
#> 5 ENSG00000001460        351                   3.0886   0.6167        314
#>   median_length_normalized     RPKM
#> 1                  29.2034   6.9478
#> 2                  13.5943   3.2342
#> 3                 438.8240 104.4015
#> 4                   2.3087   0.5493
#> 5                   2.7697   0.6589

## Drop columns "RPKM", "median_length_normalized" from all files (not necessary columns)
drop.cols <- c("RPKM", "median_length_normalized")
Merge_All_Samples <- Merge_All_Samples %>% select(-one_of(drop.cols))

预期输出:

dput(Output)
structure(list(gene = c("ENSG00000000457", "ENSG00000000460", 
                        "ENSG00000000938", "ENSG00000000971", "ENSG00000001460"), "raw_counts_TARGET-00-BM3897-14A-01R" = c(1359L, 
                                                                                                 1042L, 24113L, 296L, 351L), "raw_counts_TARGET-00-BM3969-14A-01R" = c(1497L, 1152L, 20619L, 
                                                                                                                                              251L, 314L)), class = "data.frame", row.names = c(NA, -5L))
#>              gene raw_counts_TARGET-00-BM3897-14A-01R
#> 1 ENSG00000000457                                1359
#> 2 ENSG00000000460                                1042
#> 3 ENSG00000000938                               24113
#> 4 ENSG00000000971                                 296
#> 5 ENSG00000001460                                 351
#>   raw_counts_TARGET-00-BM3969-14A-01R
#> 1                                1497
#> 2                                1152
#> 3                               20619
#> 4                                 251
#> 5                                 314

谢谢,

穆罕默德

罗纳克沙阿

你可以试试 -

library(tidyverse)
#List all the files
all_files <- dir("/Target_Data/TARGET_FHCRC/")

#include the files with the specific pattern
file_names <- grep(all_files,
                   pattern = "^T.*gene.quantification.txt$",value = TRUE)

#Read the file in a list with the name of the file as list name
Data_file <- sapply(file_names,read.delim, stringsAsFactors = FALSE, 
                    check.names = FALSE, row.names = NULL, simplify = FALSE)
#Drop .txt from name of the list
names(Data_file) <- sub('\\.txt$', '', names(Data_file))

#For each dataframe in list of dataframe append name of the file
Merge_All_Samples <- Data_file %>% 
                      imap(function(x, y) x %>% 
                            #Drop the columns that you don't need
                             select(-median_length_normalized, -RPKM) %>% 
                             rename_with(~paste0(., y), -gene)) %>%
                      reduce(inner_join, by = "gene")

Merge_All_Samples

本文收集自互联网,转载请注明来源。

如有侵权,请联系 [email protected] 删除。

编辑于
0

我来说两句

0 条评论
登录 后参与评论

相关文章