parkdongsu · chandryou · Sep 29, 2018
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
diff --git a/DIC/medi_dic.csv → MedicalDictionary/KOREA.csv b/DIC/medi_dic.csv → MedicalDictionary/KOREA.csv
diff --git a/R/DIC_COMPARE.R → R/ExtractorFromDictionary.R b/R/DIC_COMPARE.R → R/ExtractorFromDictionary.R
@@ -6,7 +6,7 @@
 #' @export
 #' @examples
 #' DIC_COMPARE()
-DIC_COMPARE <- function(doc.df){
+ExtractorFromDictionary <- function(doc.df){
 
     dictionary <- diction()
     colnames(dictionary) <- c('word')
@@ -72,7 +72,7 @@ DIC_COMPARE <- function(doc.df){
     for(i in 1:nrow(doc.df)){
         word <- strsplit(doc.df$'word'[i],' ')[[1]]
 
-        #영어는 따로 분리
+        #영어는 따로 분리
         eng_word <- gsub('[^a-zA-Z]','',word)
         eng_word[length(eng_word)+1] <- c("")
         only_eng <- eng_word[-which(eng_word == "")]
@@ -83,7 +83,7 @@ DIC_COMPARE <- function(doc.df){
 
         diag_word_tmp_tmp_df <- data.frame('row_id' = rep(doc.df$row_id[i],length(diag_word)),'word' = diag_word,stringsAsFactors = F)
 
-        #rbind 나눠서 진행
+        #rbind 나눠서 진행
         diag_word_tmp_df <- rbind(diag_word_tmp_df,diag_word_tmp_tmp_df)
         if(i %% 100 == 0){
             diag_word_df <- rbind(diag_word_df,diag_word_tmp_df)

diff --git a/R/NLP_PROCESSING_FUNCTION.R → R/LanguagePreProcessingFunction.R b/R/NLP_PROCESSING_FUNCTION.R → R/LanguagePreProcessingFunction.R
@@ -6,14 +6,14 @@
 #' @export
 #' @examples
 #' NLP_PROCESSING_FUNCTION()
-NLP_PROCESSING_FUNCTION <- function(result_xml_df){
+LanguagePreProcessingFunction <- function(result_xml_df){
 
 
     numCores <- parallel::detectCores() - 1
 
     myCluster <- parallel::makeCluster(numCores)
 
-    search_df <- result_xml_df[result_xml_df$`<MN>`=='현병력',]
+    search_df <- result_xml_df[result_xml_df$`<MN>`=='현병력',]
 
     tag ='<TD>'
 

diff --git a/R/NLP_PROCESSING.R b/R/NLP_PROCESSING.R
@@ -21,23 +21,23 @@ NLP_PROCESSING <- function(xmldf){
     xmldf <- gsub("\\/","", xmldf)
     xmldf <- gsub("\\'"," ", xmldf)
     xmldf <- gsub('\\"'," ", xmldf)
-    xmldf <- gsub("[~!@#$><%≥=^&×*-:●★¤]"," ", xmldf)
+    xmldf <- gsub("[~!@#$><%≥=^&×*-:●★¤]"," ", xmldf)
 
-    xmldf <- gsub('“', " ", xmldf)
-    xmldf <- gsub('”', " ", xmldf)
-    xmldf <- gsub('‘', " ", xmldf)
-    xmldf <- gsub('’', " ", xmldf)
+    xmldf <- gsub('“', " ", xmldf)
+    xmldf <- gsub('”', " ", xmldf)
+    xmldf <- gsub('‘', " ", xmldf)
+    xmldf <- gsub('’', " ", xmldf)
 
     xmldf <-xmldf <- gsub(',', " ", xmldf)
 
     xmldf<- tolower(xmldf)
 
-    xmldf <- gsub('[ㅏ-ㅣ]*','',xmldf)
-    xmldf <- gsub('[ㄱ-ㅎ]*','',xmldf)
+    xmldf <- gsub('[ㅏ-ㅣ]*','',xmldf)
+    xmldf <- gsub('[ㄱ-ㅎ]*','',xmldf)
 
 
-    pos_start <- as.vector(gregexpr('[^가-힣 ]*[A-Za-z]+[^가-힣 ]*',xmldf)[[1]])
-    pos_length <- as.vector(attr(gregexpr('[^가-힣 ]*[A-Za-z]+[^가-힣 ]*',xmldf)[[1]],'match.length'))
+    pos_start <- as.vector(gregexpr('[^가-힣 ]*[A-Za-z]+[^가-힣 ]*',xmldf)[[1]])
+    pos_length <- as.vector(attr(gregexpr('[^가-힣 ]*[A-Za-z]+[^가-힣 ]*',xmldf)[[1]],'match.length'))
     pos_end <- pos_start+pos_length-1
 
     word_data <- c()

diff --git a/R/getTopicFromNoteSettings.R b/R/getTopicFromNoteSettings.R
@@ -1,16 +1,18 @@
 #' Custom createCoveriate Settings
 #'
 #' This function is Custom createCoveriate Settings.
-#' @param connection,oracleTempSchema,cdmDatabaseSchema,cohortTable,cohortId,cdmVersion,rowIdField,covariateSettings,aggregated
-#' @keywordsa createCovariateSetting
-#' @export
-#' @examples
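+#' @param connection Database connection; its "dbms" attribute selects the target SQL dialect.
+#' @param oracleTempSchema,cdmVersion,aggregated Not referenced in the code shown in this diff (defaults NULL, "5", FALSE).
+#' @param cdmDatabaseSchema Schema that contains the NOTE table.
+#' @param cohortTable Cohort table joined to NOTE on person and cohort start date (default "cohort").
+#' @param cohortId Value matched against cohort_definition_id; -1 (default) disables the cohort filter.
+#' @param rowIdField Column returned as row_id (default "subject_id").
+#' @param noteConceptId Value matched against NOTE_TYPE_CONCEPT_ID.
+#' @param covariateSettings Object with flags useDictionary, useTextToVec, useTopicModeling, useGloVe, useAutoencoder.
+#' @keywords createCovariateSetting
+#' @examples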
 #' getTopicFromNoteSettings()
 
-# load packages
-
-
-
 # load packages
 if(!require(rJava)) {
     install.packages('rJava')
@@ -42,17 +44,14 @@ library(text2vec)
 library(e1071)
 useSejongDic()
 
-
-
 getTopicFromNoteSettings <- function(connection,
                                      oracleTempSchema = NULL,
                                      cdmDatabaseSchema,
                                      cohortTable = "cohort",
-                                     #cohortId = -1, #cohortId 미지정
-                                     cohortId = cohortId, # cohortId 지정
+                                     cohortId = -1,
                                      cdmVersion = "5",
                                      rowIdField = "subject_id",
-                                     conceptId = conceptId,
+                                     noteConceptId = noteConceptIdSet,
                                      covariateSettings,
                                      aggregated = FALSE){
 
@@ -62,68 +61,52 @@ getTopicFromNoteSettings <- function(connection,
     }
     if (covariateSettings$useDictionary == TRUE){
         # Some SQL to construct the covariate:
-        sql <- paste(
-            'SELECT top 100 @row_id_field AS row_id,',
-            'n.NOTE_TEXT AS covariate_id,',
-            '1 AS covariate_value',
-            'FROM @cdm_database_schema.NOTE n',
-            'JOIN @cohort_table c',
-            'ON n.person_id = c.subject_id',
-            'AND n.NOTE_DATE = c.COHORT_START_DATE',
-            'WHERE NOTE_TYPE_CONCEPT_ID = @concept_id',
-            #cohord_id가 지정되었을 때
-            'AND cohort_definition_id = @cohort_id'
-            )
-        #cohort_id가 지정되지 않았을 때
-        #"{@cohort_id != -1} ? {AND cohort_definition_id = @cohort_id}"
+        sql <- 'SELECT @row_id_field AS row_id,n.NOTE_TEXT AS covariate_id, 1 AS covariate_value
+                FROM @cdm_database_schema.NOTE n
+                JOIN @cohort_table c
+                ON n.person_id = c.subject_id AND n.NOTE_DATE = c.COHORT_START_DATE
+                WHERE NOTE_TYPE_CONCEPT_ID = @note_concept_id
+                {@cohort_id != -1} ? {AND cohort_definition_id = @cohort_id}'
 
         sql <- SqlRender::renderSql(sql,
                                     cohort_table = cohortTable,
                                     cohort_id = cohortId,
-                                    concept_id = conceptId,
+                                    note_concept_id = noteConceptId,
                                     row_id_field = rowIdField,
                                     cdm_database_schema = cdmDatabaseSchema)$sql
         sql <- SqlRender::translateSql(sql, targetDialect = attr(connection, "dbms"))$sql
 
-
-
         # Retrieve the covariate:
-        covariates <- DatabaseConnector::querySql.ffdf(connection, sql)
-
-        row_id              <-  covariates$ROW_ID
-        covariates_value    <- covariates$COVARIATE_ID
+        rawCovariates <- DatabaseConnector::querySql.ffdf(connection, sql)
+        colnames(rawCovariates)<-tolower(colnames(rawCovariates))
+        row_id              <-  rawCovariates$row_id
+        covariates_value    <- rawCovariates$covariate_id
 
-        covariates <- WORD_LOAD(row_id,covariates_value)
+        covariates <- wordToCovariate(row_id,covariates_value,useDictionary)
 
         # Convert colum names to camelCase:
         colnames(covariates) <- SqlRender::snakeCaseToCamelCase(colnames(covariates))
 
         if(covariateSettings$useTextToVec == TRUE){
             ##Text2Vec
             covariates <- covariates
-
             covariateId.factor<-as.factor(covariates$covariateId)
-
             covariateRef  <- data.frame(covariateId = seq(levels(covariateId.factor)),
                                         covariateName = levels(covariateId.factor),
-                                        analysisId = 1,
-                                        conceptId = 0)
+                                        analysisId = note_concept_id,
+                                        conceptId = note_concept_id)
             covariateRef <- ff::as.ffdf(covariateRef)
         }
 
         if(covariateSettings$useTopicModeling == TRUE){
 
             covariates.df<-data.frame(covariates)
-
             covariates.df$rowId <- as.numeric(as.factor(covariates$rowId))
             covariates.df$covariateId<-as.numeric(as.factor(covariates$covariateId))
-
             data <- Matrix::sparseMatrix(i=covariates.df$rowId,
                                          j=covariates.df$covariateId,
                                          x=covariates.df$covariateValue,
                                          dims=c(max(covariates.df$rowId), max(covariates.df$covariateId))) # edit this to max(map$newIds)
-
-
             colnames(data) <- unique(covariates.df$covariateId)
 
             ##Topic Modeling
@@ -134,10 +117,7 @@ getTopicFromNoteSettings <- function(connection,
 
             doc_topic_distr_df <- data.frame(doc_topic_distr)
 
-
-
-
-            #return 값이 row_id는 그대로/   id는 topic   /    value는 0.3
+            #return 값이 row_id는 그대로/   id는 topic   /    value는 0.3
 
             covariateRef  <- data.frame(covariateId = seq(levels(covariateId.factor)),
                                         covariateName = levels(covariateId.factor),
@@ -147,22 +127,16 @@ getTopicFromNoteSettings <- function(connection,
         }
 
         if(covariateSettings$useGloVe == TRUE){
-            break
+            stop("useGloVe not supported currently")
         }
 
         if(covariateSettings$useAutoencoder == TRUE){
-            break
+            stop("useAutoencoder not supported currently")
         }
-
-
-
-
-
-
         # Construct analysis reference:
         analysisRef <- data.frame(analysisId = 1,
-                                  analysisName = "Length of observation",
-                                  domainId = "Demographics",
+                                  analysisName = "Covariates from the Note",
+                                  domainId = "Note",
                                   startDay = 0,
                                   endDay = 0,
                                   isBinary = "N",

diff --git a/R/WORD_LOAD.R → R/wordToCovariate.R b/R/WORD_LOAD.R → R/wordToCovariate.R
@@ -6,16 +6,15 @@
 #' @export
 #' @examples
 #' WORD_LOAD()
-WORD_LOAD <- function(rowid,covariatesvalue){
+wordToCovariate <- function(rowid,covariatesvalue,useDictionary){
 
-    result_xml_df <- XML_PASING_FUNCTION(rowid,covariatesvalue)
+    result_xml_df <- NoteXmlParser(rowid,covariatesvalue)
 
-    doc.df <- NLP_PROCESSING_FUNCTION(result_xml_df)
+    if(useDictionary){result_xml_df <- LanguagePreProcessingFunction(result_xml_df)}
 
-    df <- DIC_COMPARE(doc.df)
+    df <- ExtractorFromDictionary(result_xml_df)
     df <- cbind(df,rep(1,nrow(df)))
     colnames(df) <- c('row_id','covariate_id','covariate_value')
 
-
     return(df)
 }