diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b6a065 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata diff --git a/DIC/medi_dic.csv b/MedicalDictionary/KOREA.csv similarity index 100% rename from DIC/medi_dic.csv rename to MedicalDictionary/KOREA.csv diff --git a/R/DIC_COMPARE.R b/R/ExtractorFromDictionary.R similarity index 95% rename from R/DIC_COMPARE.R rename to R/ExtractorFromDictionary.R index d39f0e2..7936901 100644 --- a/R/DIC_COMPARE.R +++ b/R/ExtractorFromDictionary.R @@ -6,7 +6,7 @@ #' @export #' @examples #' DIC_COMPARE() -DIC_COMPARE <- function(doc.df){ +ExtractorFromDictionary <- function(doc.df){ dictionary <- diction() colnames(dictionary) <- c('word') @@ -72,7 +72,7 @@ DIC_COMPARE <- function(doc.df){ for(i in 1:nrow(doc.df)){ word <- strsplit(doc.df$'word'[i],' ')[[1]] - #영어는 따로 분리 + #????????? ?????? ?????? eng_word <- gsub('[^a-zA-Z]','',word) eng_word[length(eng_word)+1] <- c("") only_eng <- eng_word[-which(eng_word == "")] @@ -83,7 +83,7 @@ DIC_COMPARE <- function(doc.df){ diag_word_tmp_tmp_df <- data.frame('row_id' = rep(doc.df$row_id[i],length(diag_word)),'word' = diag_word,stringsAsFactors = F) - #rbind 나눠서 진행 + #rbind ????????? ?????? diag_word_tmp_df <- rbind(diag_word_tmp_df,diag_word_tmp_tmp_df) if(i %% 100 == 0){ diag_word_df <- rbind(diag_word_df,diag_word_tmp_df) diff --git a/R/NLP_PROCESSING_FUNCTION.R b/R/LanguagePreProcessingFunction.R similarity index 89% rename from R/NLP_PROCESSING_FUNCTION.R rename to R/LanguagePreProcessingFunction.R index 7b96255..7f330dc 100644 --- a/R/NLP_PROCESSING_FUNCTION.R +++ b/R/LanguagePreProcessingFunction.R @@ -6,14 +6,14 @@ #' @export #' @examples #' NLP_PROCESSING_FUNCTION() -NLP_PROCESSING_FUNCTION <- function(result_xml_df){ +LanguagePreProcessingFunction <- function(result_xml_df){ numCores <- parallel::detectCores() - 1 myCluster <- parallel::makeCluster(numCores) - search_df <- result_xml_df[result_xml_df$``=='현병력',] + search_df <- result_xml_df[result_xml_df$``=='?????????',] tag ='' diff --git a/R/NLP_PROCESSING.R b/R/NLP_PROCESSING.R index 96c2159..10e32e7 100644 --- a/R/NLP_PROCESSING.R +++ b/R/NLP_PROCESSING.R @@ -21,23 +21,23 @@ NLP_PROCESSING <- function(xmldf){ xmldf <- gsub("\\/","", xmldf) xmldf <- gsub("\\'"," ", xmldf) xmldf <- gsub('\\"'," ", xmldf) - xmldf <- gsub("[~!@#$><%≥=^&×*-:●★¤]"," ", xmldf) + xmldf <- gsub("[~!@#$><%???=^&??*-:????????]"," ", xmldf) - xmldf <- gsub('“', " ", xmldf) - xmldf <- gsub('”', " ", xmldf) - xmldf <- gsub('‘', " ", xmldf) - xmldf <- gsub('’', " ", xmldf) + xmldf <- gsub('???', " ", xmldf) + xmldf <- gsub('???', " ", xmldf) + xmldf <- gsub('???', " ", xmldf) + xmldf <- gsub('???', " ", xmldf) xmldf <-xmldf <- gsub(',', " ", xmldf) xmldf<- tolower(xmldf) - xmldf <- gsub('[ㅏ-ㅣ]*','',xmldf) - xmldf <- gsub('[ㄱ-ㅎ]*','',xmldf) + xmldf <- gsub('[???-???]*','',xmldf) + xmldf <- gsub('[???-???]*','',xmldf) - pos_start <- as.vector(gregexpr('[^가-힣 ]*[A-Za-z]+[^가-힣 ]*',xmldf)[[1]]) - pos_length <- as.vector(attr(gregexpr('[^가-힣 ]*[A-Za-z]+[^가-힣 ]*',xmldf)[[1]],'match.length')) + pos_start <- as.vector(gregexpr('[^???-??? ]*[A-Za-z]+[^???-??? ]*',xmldf)[[1]]) + pos_length <- as.vector(attr(gregexpr('[^???-??? ]*[A-Za-z]+[^???-??? ]*',xmldf)[[1]],'match.length')) pos_end <- pos_start+pos_length-1 word_data <- c() diff --git a/R/getTopicFromNoteSettings.R b/R/getTopicFromNoteSettings.R index 4763865..3fe6ea0 100644 --- a/R/getTopicFromNoteSettings.R +++ b/R/getTopicFromNoteSettings.R @@ -1,16 +1,18 @@ #' Custom createCoveriate Settings #' #' This function is Custom createCoveriate Settings. -#' @param connection,oracleTempSchema,cdmDatabaseSchema,cohortTable,cohortId,cdmVersion,rowIdField,covariateSettings,aggregated -#' @keywordsa createCovariateSetting -#' @export -#' @examples +#' @connection connection,oracleTempSchema,cdmDatabaseSchema,cohortTable,cohortId,cdmVersion,rowIdField,covariateSettings,aggregated +#' @oracleTempSchema createCovariateSetting +#' @cdmDatabaseSchema +#' @cohortTable +#' @cohortId +#' @cdmVersion +#' @rowIdField +#' @noteConceptId +#' @covariateSettings +#' @aggregated #' getTopicFromNoteSettings() -# load packages - - - # load packages if(!require(rJava)) { install.packages('rJava') @@ -42,17 +44,14 @@ library(text2vec) library(e1071) useSejongDic() - - getTopicFromNoteSettings <- function(connection, oracleTempSchema = NULL, cdmDatabaseSchema, cohortTable = "cohort", - #cohortId = -1, #cohortId 미지정 - cohortId = cohortId, # cohortId 지정 + cohortId = -1, cdmVersion = "5", rowIdField = "subject_id", - conceptId = conceptId, + noteConceptId = noteConceptIdSet, covariateSettings, aggregated = FALSE){ @@ -62,38 +61,28 @@ getTopicFromNoteSettings <- function(connection, } if (covariateSettings$useDictionary == TRUE){ # Some SQL to construct the covariate: - sql <- paste( - 'SELECT top 100 @row_id_field AS row_id,', - 'n.NOTE_TEXT AS covariate_id,', - '1 AS covariate_value', - 'FROM @cdm_database_schema.NOTE n', - 'JOIN @cohort_table c', - 'ON n.person_id = c.subject_id', - 'AND n.NOTE_DATE = c.COHORT_START_DATE', - 'WHERE NOTE_TYPE_CONCEPT_ID = @concept_id', - #cohord_id가 지정되었을 때 - 'AND cohort_definition_id = @cohort_id' - ) - #cohort_id가 지정되지 않았을 때 - #"{@cohort_id != -1} ? {AND cohort_definition_id = @cohort_id}" + sql <- 'SELECT @row_id_field AS row_id,n.NOTE_TEXT AS covariate_id, 1 AS covariate_value + FROM @cdm_database_schema.NOTE n + JOIN @cohort_table c + ON n.person_id = c.subject_id AND n.NOTE_DATE = c.COHORT_START_DATE + WHERE NOTE_TYPE_CONCEPT_ID = @note_concept_id + {@cohort_id != -1} ? {AND cohort_definition_id = @cohort_id}' sql <- SqlRender::renderSql(sql, cohort_table = cohortTable, cohort_id = cohortId, - concept_id = conceptId, + note_concept_id = noteConceptId, row_id_field = rowIdField, cdm_database_schema = cdmDatabaseSchema)$sql sql <- SqlRender::translateSql(sql, targetDialect = attr(connection, "dbms"))$sql - - # Retrieve the covariate: - covariates <- DatabaseConnector::querySql.ffdf(connection, sql) - - row_id <- covariates$ROW_ID - covariates_value <- covariates$COVARIATE_ID + rawCovariates <- DatabaseConnector::querySql.ffdf(connection, sql) + colnames(rawCovariates)<-tolower(colnames(rawCovariates)) + row_id <- rawCovariates$row_id + covariates_value <- rawCovariates$covariate_id - covariates <- WORD_LOAD(row_id,covariates_value) + covariates <- wordToCovariate(row_id,covariates_value,useDictionary) # Convert colum names to camelCase: colnames(covariates) <- SqlRender::snakeCaseToCamelCase(colnames(covariates)) @@ -101,29 +90,23 @@ getTopicFromNoteSettings <- function(connection, if(covariateSettings$useTextToVec == TRUE){ ##Text2Vec covariates <- covariates - covariateId.factor<-as.factor(covariates$covariateId) - covariateRef <- data.frame(covariateId = seq(levels(covariateId.factor)), covariateName = levels(covariateId.factor), - analysisId = 1, - conceptId = 0) + analysisId = note_concept_id, + conceptId = note_concept_id) covariateRef <- ff::as.ffdf(covariateRef) } if(covariateSettings$useTopicModeling == TRUE){ covariates.df<-data.frame(covariates) - covariates.df$rowId <- as.numeric(as.factor(covariates$rowId)) covariates.df$covariateId<-as.numeric(as.factor(covariates$covariateId)) - data <- Matrix::sparseMatrix(i=covariates.df$rowId, j=covariates.df$covariateId, x=covariates.df$covariateValue, dims=c(max(covariates.df$rowId), max(covariates.df$covariateId))) # edit this to max(map$newIds) - - colnames(data) <- unique(covariates.df$covariateId) ##Topic Modeling @@ -134,10 +117,7 @@ getTopicFromNoteSettings <- function(connection, doc_topic_distr_df <- data.frame(doc_topic_distr) - - - - #return 값이 row_id는 그대로/ id는 topic / value는 0.3 + #return ?????? row_id??? ?????????/ id??? topic / value??? 0.3 covariateRef <- data.frame(covariateId = seq(levels(covariateId.factor)), covariateName = levels(covariateId.factor), @@ -147,22 +127,16 @@ getTopicFromNoteSettings <- function(connection, } if(covariateSettings$useGloVe == TRUE){ - break + stop("useGloVe not supported currently") } if(covariateSettings$useAutoencoder == TRUE){ - break + stop("useAutoencoder not supported currently") } - - - - - - # Construct analysis reference: analysisRef <- data.frame(analysisId = 1, - analysisName = "Length of observation", - domainId = "Demographics", + analysisName = "Covariates from the Note", + domainId = "Note", startDay = 0, endDay = 0, isBinary = "N", diff --git a/R/WORD_LOAD.R b/R/wordToCovariate.R similarity index 55% rename from R/WORD_LOAD.R rename to R/wordToCovariate.R index 2626133..83dd143 100644 --- a/R/WORD_LOAD.R +++ b/R/wordToCovariate.R @@ -6,16 +6,15 @@ #' @export #' @examples #' WORD_LOAD() -WORD_LOAD <- function(rowid,covariatesvalue){ +wordToCovariate <- function(rowid,covariatesvalue,useDictionary){ - result_xml_df <- XML_PASING_FUNCTION(rowid,covariatesvalue) + result_xml_df <- NoteXmlParser(rowid,covariatesvalue) - doc.df <- NLP_PROCESSING_FUNCTION(result_xml_df) + if(useDictionary){result_xml_df <- LanguagePreProcessingFunction(result_xml_df)} - df <- DIC_COMPARE(doc.df) + df <- ExtractorFromDictionary(result_xml_df) df <- cbind(df,rep(1,nrow(df))) colnames(df) <- c('row_id','covariate_id','covariate_value') - return(df) }