This package provides whole-genome mappability tracks for the human hg19 and hg38 assemblies. We used the 100-mer mappability track from the ENCODE Project and computed a weighted average of the mappability scores when multiple ENCODE regions overlap the same bin.
# Attach the package that provides the mappability tracks
library(WGSmapp)
# Load the hg19 mappability dataset bundled with the package
data("mapp_hg19")
# Print the object to inspect its genomic ranges and per-range score
mapp_hg19
## GRanges object with 21591667 ranges and 1 metadata column:
## seqnames ranges strand | score
## <Rle> <IRanges> <Rle> | <numeric>
## [1] chr1 10001-10014 * | 0.0028
## [2] chr1 10015 * | 0.3333
## [3] chr1 10016-10026 * | 0.5
## [4] chr1 10027-10031 * | 1
## [5] chr1 10032-10036 * | 0.5
## ... ... ... ... . ...
## [21591663] chrY 59363020-59363314 * | 0.0028
## [21591664] chrY 59363315-59363317 * | 0.3333
## [21591665] chrY 59363318 * | 0.25
## [21591666] chrY 59363319-59363320 * | 0.3333
## [21591667] chrY 59363321-59363517 * | 0.5
## -------
## seqinfo: 25 sequences from an unspecified genome
For the hg19 reference genome, the package also includes “blacklist” bins: segmental duplication regions and gaps in the reference assembly from telomere, centromere, and/or heterochromatin regions.
library(WGSmapp)
# Get segmental duplication regions.
# `mustWork = TRUE` makes a missing file fail with a clear error instead of
# handing an empty path to read.table(); `header` is spelled out in full
# rather than relying on partial argument matching.
seg.dup <- read.table(
  system.file("extdata", "GRCh37GenomicSuperDup.tab",
              package = "WGSmapp", mustWork = TRUE),
  header = TRUE
)
# Get hg19 gaps (this file also carries a header row)
gaps <- read.table(
  system.file("extdata", "hg19gaps.txt",
              package = "WGSmapp", mustWork = TRUE),
  header = TRUE
)
# Preview the first rows of the segmental duplication table
head(seg.dup)
## chrom chromStart chromEnd name score strand otherChrom otherStart
## 1 chr1 85326 87112 chr1:398212 0 _ chr1 398212
## 2 chr1 398212 400000 chr1:85326 0 _ chr1 85326
## 3 chr1 88000 121417 chr1:235525 0 + chr1 235525
## 4 chr1 235525 267707 chr1:88000 0 + chr1 88000
## 5 chr1 91256 92392 chr1:521369 0 + chr1 521369
## 6 chr1 521369 522487 chr1:91256 0 + chr1 91256
## otherEnd otherSize uid posBasesHit testResult verdict chits ccov
## 1 400000 1788 1 0 N/A N/A N/A N/A
## 2 87112 1786 1 0 N/A N/A N/A N/A
## 3 267707 32182 2 0 N/A N/A N/A N/A
## 4 121417 33417 2 0 N/A N/A N/A N/A
## 5 522487 1118 3 0 N/A N/A N/A N/A
## 6 92392 1136 3 0 N/A N/A N/A N/A
## alignfile alignL indelN indelS alignB matchB mismatchB
## 1 align_both/0012/both060568 1788 2 2 1786 1757 29
## 2 align_both/0012/both060568 1788 2 2 1786 1757 29
## 3 align_both/0012/both060569 33449 25 1299 32150 31941 209
## 4 align_both/0012/both060569 33449 25 1299 32150 31941 209
## 5 align_both/0012/both060581 1137 4 20 1117 1092 25
## 6 align_both/0012/both060581 1137 4 20 1117 1092 25
## transitionsB transversionsB fracMatch fracMatchIndel jcK
## 1 15 14 0.983763 0.982662 0.01641570
## 2 15 14 0.983763 0.982662 0.01641570
## 3 133 76 0.993499 0.992727 0.00652911
## 4 133 76 0.993499 0.992727 0.00652911
## 5 18 7 0.977619 0.974130 0.02272210
## 6 18 7 0.977619 0.974130 0.02272210
## k2K
## 1 0.01642270
## 2 0.01642270
## 3 0.00653207
## 4 0.00653207
## 5 0.02278150
## 6 0.02278150
# Preview the first rows of the hg19 gap table
head(gaps)
## bin chrom chromStart chromEnd ix n size type bridge
## 1 0 chr1 124535434 142535434 1271 N 18000000 heterochromatin no
## 2 23 chr1 121535434 124535434 1270 N 3000000 centromere no
## 3 76 chr1 3845268 3995268 47 N 150000 contig no
## 4 85 chr1 13219912 13319912 154 N 100000 contig no
## 5 89 chr1 17125658 17175658 196 N 50000 clone yes
## 6 101 chr1 29878082 30028082 337 N 150000 contig no
For the hg38 reference genome, the package likewise includes “blacklist” bins: segmental duplication regions and gaps in the reference assembly from telomere, centromere, and/or heterochromatin regions.
library(WGSmapp)
# Get segmental duplication regions.
# `mustWork = TRUE` makes a missing file fail with a clear error instead of
# handing an empty path to read.table(). No header argument is given, so
# read.table() assigns the default V1, V2, ... column names.
seg.dup.hg38 <- read.table(
  system.file("extdata", "GRCh38GenomicSuperDup.tab",
              package = "WGSmapp", mustWork = TRUE)
)
# Get hg38 gaps
gaps.hg38 <- read.table(
  system.file("extdata", "hg38gaps.txt",
              package = "WGSmapp", mustWork = TRUE)
)
# Preview the first rows of the segmental duplication table
head(seg.dup.hg38)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 1 chr1 10169 37148 chr1:180723 0 + chr1 180723 207666 26943 1 1000
## 2 chr1 180723 207666 chr1:10169 0 + chr1 10169 37148 26979 1 1000
## 3 chr1 88000 121417 chr1:265774 0 + chr1 265774 297956 32182 2 1000
## 4 chr1 265774 297956 chr1:88000 0 + chr1 88000 121417 33417 2 1000
## 5 chr1 88000 92392 chr1:355156 0 + chr1 355156 358335 3179 3 1000
## 6 chr1 355156 358335 chr1:88000 0 + chr1 88000 92392 4392 3 1000
## V13 V14 V15 V16 V17 V18 V19 V20 V21 V22
## 1 N/A N/A N/A N/A align_both/0014/both0071547 27025 30 128 26897 26628
## 2 N/A N/A N/A N/A align_both/0014/both0071547 27025 30 128 26897 26628
## 3 N/A N/A N/A N/A align_both/0014/both0071548 33449 25 1299 32150 31941
## 4 N/A N/A N/A N/A align_both/0014/both0071548 33449 25 1299 32150 31941
## 5 N/A N/A N/A N/A align_both/0014/both0071549 4398 8 1225 3173 3104
## 6 N/A N/A N/A N/A align_both/0014/both0071549 4398 8 1225 3173 3104
## V23 V24 V25 V26 V27 V28 V29
## 1 269 164 105 0.9899989 0.9888959 0.010068396 0.010074269
## 2 269 164 105 0.9899989 0.9888959 0.010068396 0.010074269
## 3 209 133 76 0.9934992 0.9927273 0.006529115 0.006532073
## 4 209 133 76 0.9934992 0.9927273 0.006529115 0.006532073
## 5 69 46 23 0.9782540 0.9757938 0.022067470 0.022109061
## 6 69 46 23 0.9782540 0.9757938 0.022067470 0.022109061
# Preview the first rows of the hg38 gap table
head(gaps.hg38)
## V1 V2 V3 V4 V5 V6 V7 V8 V9
## 1 585 chr1 0 10000 1 N 10000 telomere no
## 2 586 chr1 207666 257666 5 N 50000 contig no
## 3 587 chr1 297968 347968 7 N 50000 contig no
## 4 589 chr1 535988 585988 10 N 50000 contig no
## 5 605 chr1 2702781 2746290 48 N 43509 scaffold yes
## 6 85 chr1 12954384 13004384 224 N 50000 scaffold yes
The dataset consists of three assembled .bam files of single-cell whole genome sequencing from 10X Genomics Single-Cell CNV solution for illustration purposes. These three cells are from section E of five adjacent tumor dissections of a breast cancer patient. Corresponding cellular barcode tags are “AAAGCAATCTGACGCG”, “GCAGTTACACTGTATG”, and “CTCGTCACAGGTTAAA”.