---
title: "Text Preprocessing and Entity Extraction"
author: "Chao Liu"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Text Preprocessing and Entity Extraction}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
# Global knitr chunk options applied to every chunk in this vignette
knitr::opts_chunk$set(
  collapse = TRUE,   # merge source and output into a single block
  comment = "#>",    # prefix printed output lines with "#>"
  fig.width = 7,     # default figure width
  fig.height = 5     # default figure height
)
```

# Text Preprocessing and Entity Extraction

This vignette explains the text preprocessing and entity extraction capabilities of the `LBDiscover` package, which are fundamental steps in the literature-based discovery process.

## Introduction

Before applying discovery models, we need to preprocess the text data and extract the entities of interest. These steps transform raw text into structured information that can be used for discovering relationships between biomedical concepts.

## Loading the Package

```{r load-package}
library(LBDiscover)
```

## Data Retrieval

First, let's retrieve some sample articles:

```{r retrieve-data, eval = TRUE}
# Search for articles about migraines (max_results = 100 is passed to
# pubmed_search)
migraine_articles <- pubmed_search(
  query = "migraine pathophysiology",
  max_results = 100
)

# View the PMID and title of the first three articles
head(migraine_articles[, c("pmid", "title")], 3)
```

## Basic Text Preprocessing

The first step is to preprocess the text data to extract meaningful terms:

```{r basic-preprocessing, eval = TRUE}
# Preprocess the abstracts: remove stopwords (plus four extra custom ones)
# and pass word-length bounds of 3 and 25
preprocessed_data <- preprocess_text(
  migraine_articles,
  text_column = "abstract",
  remove_stopwords = TRUE,
  custom_stopwords = c("study", "patient", "result", "conclusion"),
  min_word_length = 3,
  max_word_length = 25
)

# View the first 10 terms extracted from the first document
head(preprocessed_data$terms[[1]], 10)
```

### Optimized Preprocessing for Large Datasets

For larger datasets, we can use the optimized vectorized preprocessing function:

```{r optimized-preprocessing, eval = TRUE}
# Run the vectorized preprocessor, processing 50 documents per chunk
opt_preprocessed_data <- vec_preprocess(
  migraine_articles,
  text_column = "abstract", remove_stopwords = TRUE,
  min_word_length = 3, chunk_size = 50
)

# Time the standard preprocessor on the same articles ...
system.time(
  preprocess_text(
    migraine_articles,
    text_column = "abstract", remove_stopwords = TRUE
  )
)

# ... and the vectorized one, so the two timings can be compared
system.time(
  vec_preprocess(
    migraine_articles,
    text_column = "abstract", remove_stopwords = TRUE, chunk_size = 50
  )
)
```

## Advanced Text Analysis

### N-gram Extraction

We can extract n-grams (sequences of n words) to capture multi-word concepts:

```{r ngram-extraction, eval = TRUE}
# Extract bigrams (2-word sequences) from the raw abstract text;
# min_freq = 2 is passed as the frequency cutoff
bigrams <- extract_ngrams(
  migraine_articles$abstract,
  n = 2,
  min_freq = 2
)

# View the most frequent bigrams (first 10 entries of the result)
head(bigrams, 10)
```

### Sentence Segmentation

Segmenting text into sentences can be useful for more granular analysis:

```{r sentence-segmentation, eval = TRUE}
# Start from the first abstract returned by the search
abstracts <- migraine_articles$abstract
first_abstract <- abstracts[1]

# Make sure we have a valid abstract. The length check comes first so that
# is.na() and nchar() are only ever evaluated on a length-one value; `||`
# does not accept zero-length conditions.
if (length(first_abstract) == 0 ||
      is.na(first_abstract) ||
      nchar(first_abstract) == 0) {
  # Find the first non-empty abstract
  valid_idx <- which(!is.na(abstracts) & nchar(abstracts) > 0)
  if (length(valid_idx) > 0) {
    first_abstract <- abstracts[valid_idx[1]]
    cat("First abstract was empty, using abstract #", valid_idx[1], "instead.\n")
  } else {
    # Create a sample abstract for demonstration
    first_abstract <- "This is a sample abstract for demonstration. It contains multiple sentences. Each sentence will be extracted separately."
    cat("No valid abstracts found. Using a sample abstract for demonstration.\n")
  }
}

# Now segment the valid abstract
sentences <- segment_sentences(first_abstract)

# Only index into the result once we know it has content; an unguarded
# sentences[[1]] fails with "subscript out of bounds" on an empty result.
if (length(sentences) > 0 && length(sentences[[1]]) > 0) {
  # View the first few sentences (head() already caps at the available length)
  head(sentences[[1]], 3)
} else {
  cat("No sentences could be extracted. The abstract might be too short or formatted incorrectly.\n")
}
```

### Language Detection

When working with multilingual corpora, we can detect the language of each document:

```{r language-detection, eval = TRUE}
# Take up to the first five abstracts and note which ones are not missing.
# head() is used instead of [1:5] so that a corpus with fewer than five
# articles is not padded with NA entries.
abstracts <- head(migraine_articles$abstract, 5)
has_text <- !is.na(abstracts)
valid_abstracts <- abstracts[has_text]

# Apply language detection to valid abstracts
if (length(valid_abstracts) > 0) {
  # USE.NAMES = FALSE: by default sapply() names the result after each input
  # string, which would turn every full abstract into a data frame row name.
  languages <- sapply(valid_abstracts, detect_lang, USE.NAMES = FALSE)

  # View results; abstract_id is the position within the first five abstracts
  data.frame(
    abstract_id = which(has_text),
    language = languages
  )
} else {
  message("No valid abstracts found for language detection")
}
```

## Entity Extraction

After preprocessing, the next step is to extract biomedical entities from the text.

### Loading Entity Dictionaries

First, let's load entity dictionaries that will be used for entity recognition:

```{r load-dictionaries, eval = TRUE}
# Disease dictionary, requested from the "mesh" source
disease_dict <- load_dictionary(dictionary_type = "disease", source = "mesh")

# Drug dictionary, requested from the same source
drug_dict <- load_dictionary(dictionary_type = "drug", source = "mesh")

# Show the first three entries of each dictionary
head(disease_dict, n = 3)
head(drug_dict, n = 3)
```

### Basic Entity Extraction

Now we can extract entities from the text using these dictionaries:

```{r basic-entity-extraction, eval = TRUE}
# Extract disease and drug entities: the two dictionaries are stacked with
# rbind() and passed as one lookup table, with case_sensitive = FALSE
entities <- extract_entities(
  preprocessed_data,
  text_column = "abstract",
  dictionary = rbind(disease_dict, drug_dict),
  case_sensitive = FALSE,
  overlap_strategy = "priority"
)

# View selected columns for the first 10 extracted entities
head(entities[, c("doc_id", "entity", "entity_type", "sentence")], 10)
```

### Complete Entity Extraction Workflow

For a more comprehensive approach, we can use the complete entity extraction workflow:

```{r complete-entity-extraction, eval = TRUE}
# Detect whether the vignette is being built under R CMD check.
# Sys.getenv() returns "" (never NULL) for an unset variable, so the variables
# are tested by value / with nzchar(); an is.null() test would always report
# the variable as set and permanently disable parallel processing.
is_check <- (!interactive() && Sys.getenv("R_CHECK_RUNNING") == "true") ||
  nzchar(Sys.getenv("_R_CHECK_LIMIT_CORES_"))

# Set number of cores based on environment
num_cores_to_use <- if (is_check) 1 else 4

# Extract entities using the complete workflow
entities_workflow <- extract_entities_workflow(
  preprocessed_data,
  text_column = "abstract",
  entity_types = c("disease", "drug", "gene", "protein", "pathway"),
  dictionary_sources = c("local", "mesh"),
  sanitize = TRUE,
  parallel = !is_check,           # Disable parallel in check environment
  num_cores = num_cores_to_use    # Use 1 core in check environment
)

# View summary of entity types
table(entities_workflow$entity_type)
```

### Customizing Entity Extraction

We can customize the entity extraction process by providing additional MeSH queries or custom dictionaries:

```{r custom-entity-extraction, eval = TRUE}
# Extra MeSH queries, keyed by the entity type they apply to
mesh_queries <- list(
  disease = "migraine disorders[MeSH] OR headache disorders[MeSH]",
  drug = "analgesics[MeSH] OR serotonin agonists[MeSH] OR anticonvulsants[MeSH]",
  gene = "genes[MeSH] OR channelopathy[MeSH]"
)

# A three-row custom dictionary; the single source value is recycled
# across all rows by data.frame()
custom_dict <- data.frame(
  term = c("CGRP", "trigeminal nerve", "cortical spreading depression"),
  type = c("protein", "anatomy", "biological_process"),
  id = paste0("CUSTOM_", 1:3),
  source = "custom",
  stringsAsFactors = FALSE
)

# Run the workflow again, this time with the extra queries and dictionary
custom_entities <- extract_entities_workflow(
  preprocessed_data,
  text_column = "abstract",
  entity_types = c("disease", "drug", "gene", "protein", "pathway"),
  dictionary_sources = c("local", "mesh"),
  additional_mesh_queries = mesh_queries,
  custom_dictionary = custom_dict,
  sanitize = TRUE
)

# Keep only the rows whose source is "custom"
custom_entities[custom_entities$source == "custom", ]
```

## Dictionary Sanitization

The quality of entity extraction heavily depends on the quality of the dictionaries. We can sanitize dictionaries to improve extraction quality:

```{r dictionary-sanitization, eval = TRUE}
# Build an eight-row example dictionary that mixes useful terms with
# problematic ones (common words typed as "NA")
raw_dict <- data.frame(
  term = c(
    "migraine", "5-HT", "headache", "the",
    "and", "patient", "inflammation", "study"
  ),
  type = c(
    "disease", "chemical", "symptom", "NA",
    "NA", "NA", "biological_process", "NA"
  ),
  id = sprintf("ID_%d", 1:8),
  source = "example",
  stringsAsFactors = FALSE
)

# Clean it up, validating types and reporting progress
sanitized_dict <- sanitize_dictionary(
  raw_dict,
  term_column = "term", type_column = "type",
  validate_types = TRUE, verbose = TRUE
)

# Print the result
sanitized_dict
```

## Mapping Terms to Biomedical Ontologies

We can map extracted terms to standard biomedical ontologies like MeSH or UMLS:

```{r ontology-mapping, eval = TRUE}
# Terms to look up in the ontology
terms_to_map <- c("migraine", "headache", "CGRP", "serotonin")

# Map to MeSH, with fuzzy matching enabled and a similarity threshold of 0.8
mesh_mappings <- map_ontology(
  terms_to_map,
  ontology = "mesh",
  fuzzy_match = TRUE,
  similarity_threshold = 0.8
)

# View MeSH mappings
mesh_mappings
```

## Topic Modeling

We can also apply topic modeling to discover the main themes in the corpus:

```{r topic-modeling, eval = TRUE}
# Extract topics from the abstracts, requesting n_topics = 5 and
# max_terms = 10
topics <- extract_topics(
  migraine_articles,
  text_column = "abstract",
  n_topics = 5,
  max_terms = 10
)

# View top terms for each topic (the `topics` element of the result)
topics$topics
```