Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata
File renamed without changes.
6 changes: 3 additions & 3 deletions R/DIC_COMPARE.R → R/ExtractorFromDictionary.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#' @export
#' @examples
#' DIC_COMPARE()
DIC_COMPARE <- function(doc.df){
ExtractorFromDictionary <- function(doc.df){

dictionary <- diction()
colnames(dictionary) <- c('word')
Expand Down Expand Up @@ -72,7 +72,7 @@ DIC_COMPARE <- function(doc.df){
for(i in 1:nrow(doc.df)){
word <- strsplit(doc.df$'word'[i],' ')[[1]]

#영어는 따로 분리
#영어는 따로 분리
eng_word <- gsub('[^a-zA-Z]','',word)
eng_word[length(eng_word)+1] <- c("")
only_eng <- eng_word[-which(eng_word == "")]
Expand All @@ -83,7 +83,7 @@ DIC_COMPARE <- function(doc.df){

diag_word_tmp_tmp_df <- data.frame('row_id' = rep(doc.df$row_id[i],length(diag_word)),'word' = diag_word,stringsAsFactors = F)

#rbind 나눠서 진행
#rbind 나눠서 진행
diag_word_tmp_df <- rbind(diag_word_tmp_df,diag_word_tmp_tmp_df)
if(i %% 100 == 0){
diag_word_df <- rbind(diag_word_df,diag_word_tmp_df)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@
#' @export
#' @examples
#' NLP_PROCESSING_FUNCTION()
NLP_PROCESSING_FUNCTION <- function(result_xml_df){
LanguagePreProcessingFunction <- function(result_xml_df){


numCores <- parallel::detectCores() - 1

myCluster <- parallel::makeCluster(numCores)

search_df <- result_xml_df[result_xml_df$`<MN>`=='현병력',]
search_df <- result_xml_df[result_xml_df$`<MN>`=='?????????',]

tag ='<TD>'

Expand Down
18 changes: 9 additions & 9 deletions R/NLP_PROCESSING.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,23 +21,23 @@ NLP_PROCESSING <- function(xmldf){
xmldf <- gsub("\\/","", xmldf)
xmldf <- gsub("\\'"," ", xmldf)
xmldf <- gsub('\\"'," ", xmldf)
xmldf <- gsub("[~!@#$><%=^&×*-:●★¤]"," ", xmldf)
xmldf <- gsub("[~!@#$><%???=^&??*-:????????]"," ", xmldf)

xmldf <- gsub('', " ", xmldf)
xmldf <- gsub('', " ", xmldf)
xmldf <- gsub('', " ", xmldf)
xmldf <- gsub('', " ", xmldf)
xmldf <- gsub('???', " ", xmldf)
xmldf <- gsub('???', " ", xmldf)
xmldf <- gsub('???', " ", xmldf)
xmldf <- gsub('???', " ", xmldf)

xmldf <-xmldf <- gsub(',', " ", xmldf)

xmldf<- tolower(xmldf)

xmldf <- gsub('[ㅏ-ㅣ]*','',xmldf)
xmldf <- gsub('[ㄱ-ㅎ]*','',xmldf)
xmldf <- gsub('[???-???]*','',xmldf)
xmldf <- gsub('[???-???]*','',xmldf)


pos_start <- as.vector(gregexpr('[^가-힣 ]*[A-Za-z]+[^가-힣 ]*',xmldf)[[1]])
pos_length <- as.vector(attr(gregexpr('[^가-힣 ]*[A-Za-z]+[^가-힣 ]*',xmldf)[[1]],'match.length'))
pos_start <- as.vector(gregexpr('[^???-??? ]*[A-Za-z]+[^???-??? ]*',xmldf)[[1]])
pos_length <- as.vector(attr(gregexpr('[^???-??? ]*[A-Za-z]+[^???-??? ]*',xmldf)[[1]],'match.length'))
pos_end <- pos_start+pos_length-1

word_data <- c()
Expand Down
88 changes: 31 additions & 57 deletions R/getTopicFromNoteSettings.R
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
#' Custom createCovariate Settings
#'
#' This function is Custom createCovariate Settings.
#' @param connection,oracleTempSchema,cdmDatabaseSchema,cohortTable,cohortId,cdmVersion,rowIdField,covariateSettings,aggregated
#' @keywordsa createCovariateSetting
#' @export
#' @examples
#' @connection connection,oracleTempSchema,cdmDatabaseSchema,cohortTable,cohortId,cdmVersion,rowIdField,covariateSettings,aggregated
#' @oracleTempSchema createCovariateSetting
#' @cdmDatabaseSchema
#' @cohortTable
#' @cohortId
#' @cdmVersion
#' @rowIdField
#' @noteConceptId
#' @covariateSettings
#' @aggregated
#' getTopicFromNoteSettings()

# load packages



# load packages
if(!require(rJava)) {
install.packages('rJava')
Expand Down Expand Up @@ -42,17 +44,14 @@ library(text2vec)
library(e1071)
useSejongDic()



getTopicFromNoteSettings <- function(connection,
oracleTempSchema = NULL,
cdmDatabaseSchema,
cohortTable = "cohort",
#cohortId = -1, #cohortId 미지정
cohortId = cohortId, # cohortId 지정
cohortId = -1,
cdmVersion = "5",
rowIdField = "subject_id",
conceptId = conceptId,
noteConceptId = noteConceptIdSet,
covariateSettings,
aggregated = FALSE){

Expand All @@ -62,68 +61,52 @@ getTopicFromNoteSettings <- function(connection,
}
if (covariateSettings$useDictionary == TRUE){
# Some SQL to construct the covariate:
sql <- paste(
'SELECT top 100 @row_id_field AS row_id,',
'n.NOTE_TEXT AS covariate_id,',
'1 AS covariate_value',
'FROM @cdm_database_schema.NOTE n',
'JOIN @cohort_table c',
'ON n.person_id = c.subject_id',
'AND n.NOTE_DATE = c.COHORT_START_DATE',
'WHERE NOTE_TYPE_CONCEPT_ID = @concept_id',
#cohord_id가 지정되었을 때
'AND cohort_definition_id = @cohort_id'
)
#cohort_id가 지정되지 않았을 때
#"{@cohort_id != -1} ? {AND cohort_definition_id = @cohort_id}"
sql <- 'SELECT @row_id_field AS row_id,n.NOTE_TEXT AS covariate_id, 1 AS covariate_value
FROM @cdm_database_schema.NOTE n
JOIN @cohort_table c
ON n.person_id = c.subject_id AND n.NOTE_DATE = c.COHORT_START_DATE
WHERE NOTE_TYPE_CONCEPT_ID = @note_concept_id
{@cohort_id != -1} ? {AND cohort_definition_id = @cohort_id}'

sql <- SqlRender::renderSql(sql,
cohort_table = cohortTable,
cohort_id = cohortId,
concept_id = conceptId,
note_concept_id = noteConceptId,
row_id_field = rowIdField,
cdm_database_schema = cdmDatabaseSchema)$sql
sql <- SqlRender::translateSql(sql, targetDialect = attr(connection, "dbms"))$sql



# Retrieve the covariate:
covariates <- DatabaseConnector::querySql.ffdf(connection, sql)

row_id <- covariates$ROW_ID
covariates_value <- covariates$COVARIATE_ID
rawCovariates <- DatabaseConnector::querySql.ffdf(connection, sql)
colnames(rawCovariates)<-tolower(colnames(rawCovariates))
row_id <- rawCovariates$row_id
covariates_value <- rawCovariates$covariate_id

covariates <- WORD_LOAD(row_id,covariates_value)
covariates <- wordToCovariate(row_id,covariates_value,useDictionary)

# Convert column names to camelCase:
colnames(covariates) <- SqlRender::snakeCaseToCamelCase(colnames(covariates))

if(covariateSettings$useTextToVec == TRUE){
##Text2Vec
covariates <- covariates

covariateId.factor<-as.factor(covariates$covariateId)

covariateRef <- data.frame(covariateId = seq(levels(covariateId.factor)),
covariateName = levels(covariateId.factor),
analysisId = 1,
conceptId = 0)
analysisId = note_concept_id,
conceptId = note_concept_id)
covariateRef <- ff::as.ffdf(covariateRef)
}

if(covariateSettings$useTopicModeling == TRUE){

covariates.df<-data.frame(covariates)

covariates.df$rowId <- as.numeric(as.factor(covariates$rowId))
covariates.df$covariateId<-as.numeric(as.factor(covariates$covariateId))

data <- Matrix::sparseMatrix(i=covariates.df$rowId,
j=covariates.df$covariateId,
x=covariates.df$covariateValue,
dims=c(max(covariates.df$rowId), max(covariates.df$covariateId))) # edit this to max(map$newIds)


colnames(data) <- unique(covariates.df$covariateId)

##Topic Modeling
Expand All @@ -134,10 +117,7 @@ getTopicFromNoteSettings <- function(connection,

doc_topic_distr_df <- data.frame(doc_topic_distr)




#return 값이 row_id는 그대로/ id는 topic / value는 0.3
#return 값이 row_id는 그대로/ id는 topic / value는 0.3

covariateRef <- data.frame(covariateId = seq(levels(covariateId.factor)),
covariateName = levels(covariateId.factor),
Expand All @@ -147,22 +127,16 @@ getTopicFromNoteSettings <- function(connection,
}

if(covariateSettings$useGloVe == TRUE){
break
stop("useGloVe not supported currently")
}

if(covariateSettings$useAutoencoder == TRUE){
break
stop("useAutoencoder not supported currently")
}






# Construct analysis reference:
analysisRef <- data.frame(analysisId = 1,
analysisName = "Length of observation",
domainId = "Demographics",
analysisName = "Covariates from the Note",
domainId = "Note",
startDay = 0,
endDay = 0,
isBinary = "N",
Expand Down
9 changes: 4 additions & 5 deletions R/WORD_LOAD.R → R/wordToCovariate.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,15 @@
#' @export
#' @examples
#' WORD_LOAD()
WORD_LOAD <- function(rowid,covariatesvalue){
wordToCovariate <- function(rowid,covariatesvalue,useDictionary){

result_xml_df <- XML_PASING_FUNCTION(rowid,covariatesvalue)
result_xml_df <- NoteXmlParser(rowid,covariatesvalue)

doc.df <- NLP_PROCESSING_FUNCTION(result_xml_df)
if(useDictionary){result_xml_df <- LanguagePreProcessingFunction(result_xml_df)}

df <- DIC_COMPARE(doc.df)
df <- ExtractorFromDictionary(result_xml_df)
df <- cbind(df,rep(1,nrow(df)))
colnames(df) <- c('row_id','covariate_id','covariate_value')


return(df)
}