Abstract Classification using the LACS framework

Here we show how to use the LACS framework and our R packages to classify abstracts for a specific scientific literature topic. In this exercise we train a model on abstracts from the GMPD and Zover databases and then use it to classify GBIF abstracts.
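The code below relies on several packages; the one-time setup sketched here is an assumption based on the calls used in this walkthrough, so adjust it to your own library.

# one-time setup; skip anything you already have installed
install.packages(c("tidyverse", "tidymodels", "text2vec",
                   "stringi", "stopwords", "glmnet", "pROC"))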

library(tidyverse)
library(tidymodels)
library(text2vec)
library(stringi)    # stri_enc_isascii(), used by prep_fun() and the vocabulary filters
library(ggplot2)

db_abstracts <-  readr::read_csv("https://raw.githubusercontent.com/alrobles/abstractsHostParasites/main/data-raw/df_abstracts.csv")
#View(db_abstracts)
db_abstracts <- db_abstracts %>%
  mutate(Label.obs = ifelse(class == "parasite", 1, 0))  # 1 = parasite abstract (known positive), 0 = the rest
set.seed(123)  # any fixed seed; keeps the 65/35 split reproducible
train_test_split <- initial_split(db_abstracts, prop = 0.65)
db_abstracts_train <- training(train_test_split)
db_abstracts_test <- testing(train_test_split)
db_abstracts_train %>% count(class)
db_abstracts_test %>% count(class)


#db_abstracts <- db_abstracts %>% 
#mutate(language = textcat(search_abstract))


# Text-cleaning helper; it can be passed to itoken() as the preprocessor
prep_fun = function(x) {
  # make text lower case
  x = str_to_lower(x)
  # remove leading digits
  x = str_replace_all(x, "^\\d+", " ")
  # replace non-alphanumeric symbols with spaces
  x = str_replace_all(x, "[^[:alnum:]]", " ")
  # collapse multiple spaces
  x = str_replace_all(x, "\\s+", " ")
  # blank out strings that are not pure ASCII (i.e. non-English abstracts)
  ifelse(stri_enc_isascii(x), x, str_replace_all(x, ".", ""))
}
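A quick call on an ad-hoc string (illustrative text only, not taken from the dataset) shows the cleaning steps at work:

# leading digits are dropped, punctuation becomes whitespace, case is lowered
prep_fun("12 Ticks (Ixodidae) recovered from Myotis lucifugus!")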

it_train = itoken(db_abstracts_train$abstract,
                  preprocessor = tolower, 
                  tok_fun = word_tokenizer,
                  progressbar = FALSE)

it_test = itoken(db_abstracts_test$abstract,
                 preprocessor = tolower, 
                 tok_fun = word_tokenizer,
                 progressbar = FALSE)

v <-  create_vocabulary(it_train, 
                        ngram = c(1L, 4L), 
                        stopwords = stopwords::stopwords())

# Clean the vocabulary: drop numeric-looking, non-ASCII and very short terms
v <- v %>%
  dplyr::filter(!grepl("^[0-9]", term)) %>%            # terms starting with a digit
  dplyr::filter(!grepl("[0-9]$", term)) %>%            # terms ending with a digit
  dplyr::filter(grepl("^[a-z]", term)) %>%             # keep terms starting with a letter
  dplyr::filter(stringi::stri_enc_isascii(term)) %>%   # ASCII-only terms
  dplyr::filter(nchar(term) >= 4)                      # at least four characters
#vectorizer <-  vocab_vectorizer(v)

pruned_vocab = prune_vocabulary(v, 
                                term_count_min = 20
                                #,doc_proportion_max = 0.1
                                #,doc_proportion_min = 0.01
)
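A quick look at the most frequent terms that survive pruning is a useful sanity check before building the vectorizer:

# top remaining terms by corpus frequency
pruned_vocab %>%
  as.data.frame() %>%
  dplyr::arrange(dplyr::desc(term_count)) %>%
  head(10)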
#pruned_vocab_pulearning <- pruned_vocab

#write_rds(pruned_vocab_pulearning, "pruned_vocab_pulearning.rds")

vectorizer = vocab_vectorizer(pruned_vocab)
dtm_train <- create_dtm(it_train, vectorizer)
dtm_test <- create_dtm(it_test, vectorizer)
tfidf = TfIdf$new()
# fit the IDF weights on the training DTM, then reuse them for the test DTM
dtm_df_tfidf_train <- fit_transform(dtm_train, tfidf)
dtm_df_tfidf_test <- transform(dtm_test, tfidf)
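Both DTMs are built with the same vectorizer, so they should share the pruned vocabulary as columns; a quick check confirms this before fitting the classifier:

# train and test matrices must have identical column spaces
dim(dtm_df_tfidf_train)
dim(dtm_df_tfidf_test)
identical(colnames(dtm_df_tfidf_train), colnames(dtm_df_tfidf_test))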

dtm_df_train <- as.matrix(dtm_df_tfidf_train) %>% as_tibble() 
dtm_df_test <- as.matrix(dtm_df_tfidf_test) %>% as_tibble() 

dtm_df_train$class <-  db_abstracts_train[['class']]
dtm_df_test$class <-  db_abstracts_test[['class']]

# Fit the PU-learning classifier (PLUS) on the TF-IDF training matrix
# source("PLUR.R")
# library(glmnet)

Prediction <- 
  PLUS(train_data = dtm_df_tfidf_train,
       Label.obs = db_abstracts_train[["Label.obs"]],
       Sample_use_time = 100,
       l.rate = 1, qq = 0.15)
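Before pulling pieces out of the result, it helps to inspect its structure; only coef1 and fit.pi (apparently a cross-validated glmnet fit, given the lambda.min argument used below) are needed for what follows:

# top-level components of the PLUS result
str(Prediction, max.level = 1)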

Coef_Matrix <- Prediction$coef1 %>% as.matrix()


# predicted probability of the positive class (1)
yhat_pulearning_prob_vec <- predict(Prediction$fit.pi,
                                    s = 'lambda.min',
                                    newx = dtm_df_tfidf_test,
                                    type = "response") %>% as.numeric()

# predicted 0/1 class at the default 0.5 probability cutoff
yhat_pulearning_class_vec <- predict(Prediction$fit.pi,
                                     s = 'lambda.min',
                                     newx = dtm_df_tfidf_test,
                                     type = "class") %>% as.numeric()

# Format test data and predictions for yardstick metrics
estimates_pulearning_tbl <- tibble(
  truth      = factor(db_abstracts_test[["Label.obs"]], levels = c(0, 1)),
  estimate   = factor(yhat_pulearning_class_vec, levels = c(0, 1)),
  class_prob = yhat_pulearning_prob_vec
)


estimates_pulearning_tbl %>% yardstick::conf_mat(truth, estimate)
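A few additional threshold metrics can be computed the same way; passing event_level = "second" tells yardstick that the second factor level (1, the parasite class) is the event of interest:

estimates_pulearning_tbl %>% yardstick::accuracy(truth, estimate)
estimates_pulearning_tbl %>% yardstick::sens(truth, estimate, event_level = "second")
estimates_pulearning_tbl %>% yardstick::spec(truth, estimate, event_level = "second")
estimates_pulearning_tbl %>% yardstick::precision(truth, estimate, event_level = "second")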


estimates_pulearning_tbl %>% arrange(class_prob) %>% 
  mutate(id = row_number()) %>% 
  ggplot() + geom_point(aes(id, class_prob))

estimates_pulearning_tbl %>%
  yardstick::f_meas(truth, estimate, beta = 1, event_level = "second")  # 1 (parasite) is the event of interest

estimates_pulearning_tbl %>%
  arrange(class_prob) %>% 
  pROC::roc_(response = "truth", predictor = "class_prob") %>% 
  plot()

pROC::auc(estimates_pulearning_tbl$truth, estimates_pulearning_tbl$class_prob,
          partial.auc=c(1, .8), partial.auc.focus="se",
          partial.auc.correct = TRUE)
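As a cross-check on the partial AUC above, the full AUC can also be computed with yardstick directly from the same tibble:

# full ROC AUC; event_level = "second" makes class 1 the event
estimates_pulearning_tbl %>%
  yardstick::roc_auc(truth, class_prob, event_level = "second")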
# glmnet_classifier_pulearning <- Prediction$fit.pi
# write_rds(glmnet_classifier_pulearning, "glmnet_classifier_pulearning_7_9_22.rds")
