library(MuSiC)
library(Biobase)
library(magrittr)
library(tibble)
library(dplyr)
library(tidyr)
library(knitr)
library(xlsx)
library(ggplot2)
library(openxlsx)
library(RColorBrewer)
library(clusterProfiler)
library(pheatmap)

## what is needed: mydataframe, coldata
coldata <- read.xlsx("Dataframe/mydataframe_all.xlsx")
coldata <- as.data.frame(coldata)
coldata <- subset.data.frame(x = coldata, subset = !coldata$expgroup == "THAL_GMP" )
coldata <- subset.data.frame(x = coldata, subset = !coldata$expgroup == "HD_GMP" )
coldata$sex <- c(rep("Male", 16), rep("Female", 8), rep("Male", 4), rep("Female", 11))
coldata$sex <- as.factor(coldata$sex)
coldata$time <- as.factor(coldata$time)
coldata$disease <- as.factor(coldata$disease)
coldata$name <- as.factor(coldata$name)

rownames(coldata) <- coldata$rownames

## CountData
countData <- read.table(file = "Full-gene-counts.txt", header = TRUE, stringsAsFactors = FALSE, row.names = 1)
countData <- countData[ , c(1:8,11:18,21:43)]
colnames(countData) <- coldata$name

coldata <- droplevels.data.frame(coldata)
coldata$time <- factor(coldata$time, levels = c("HSC","MPP","CMP","MEP", "R5","R6","R7","R8"))

## covert data.frame in matrix for expressionset
exprs_count <- as.matrix(countData)
minimalSet <- ExpressionSet(assayData = exprs_count)
pdata <- coldata
rownames(pdata) <- pdata$name
 
## verify the order of pdata and exprs count ##
all(rownames(pdata) == colnames(exprs_count)) ## MUST BE EQUAL 

metadata <- data.frame(labelDescription = 
                         c("rownames used" , "Patient name", "differentiation time",
                           "Thal/HD condition", "expgroup", "patient gender"),
                       row.names = c("rownames","name", "time", "disease", "expgroup", "sex"))

phenodata <- new("AnnotatedDataFrame", data = pdata, varMetadata = metadata)
rnaseqSet <- ExpressionSet(assayData = exprs_count,
                           phenoData = phenodata)

saveRDS(rnaseqSet, "rnaseqSet.rds")