---
title: "Working with Discovery Models"
author: "Chao Liu"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Working with Discovery Models}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)
```

# Discovery Models in Literature-Based Discovery

This vignette explores the various discovery models available in the `LBDiscover` package, which enable you to identify hidden connections in the biomedical literature.

## Overview of Discovery Models

The `LBDiscover` package implements several discovery models:

1. **ABC Model**: The classic model that discovers implicit connections between terms A and C through intermediate B terms.

2. **AnC Model**: A variant of the ABC model that uses multiple B terms to establish stronger connections between A and C.

3. **BITOLA Model**: A model that incorporates semantic types to guide the discovery process.

4. **LSI (Latent Semantic Indexing) Model**: A model that uses dimensionality reduction to discover semantically related terms.

Each model has its strengths and is suitable for different types of discovery tasks.

## Setup and Data Preparation

Let's start by loading the package and preparing some sample data:

```{r load-package, message=FALSE}
library(LBDiscover)
```

```{r prepare-data, eval = TRUE, message=FALSE}
# Search for articles about Alzheimer's disease
alzheimer_articles <- pubmed_search(
  query = "alzheimer pathophysiology",
  max_results = 1000
)

# Search for treatment-related articles
drug_articles <- pubmed_search(
  query = "neurodegenerative disease treatment OR cognitive impairment therapy",
  max_results = 1000
)

# Combine and remove duplicates
all_articles <- merge_results(alzheimer_articles, drug_articles)
print(paste("Retrieved", nrow(all_articles), "unique articles"))

# Preprocess text
preprocessed_articles <- preprocess_text(
  all_articles,
  text_column = "abstract",
  remove_stopwords = TRUE,
  min_word_length = 3
)

# Extract entities
entities <- extract_entities_workflow(
  preprocessed_articles,
  text_column = "abstract",
  entity_types = c("disease", "drug", "gene", "protein", "chemical", "pathway"),
  dictionary_sources = c("local", "mesh")
)

# Create co-occurrence matrix with less restrictive normalization
co_matrix <- create_comat(
  entities,
  doc_id_col = "doc_id",
  entity_col = "entity",
  type_col = "entity_type",
  normalize = TRUE,
  normalization_method = "dice"  # Using dice coefficient for better sensitivity
)

# Define our primary term of interest
a_term <- "alzheimer"

# Let's first examine what terms are available in the co-occurrence matrix
available_terms <- rownames(co_matrix)
message("First 30 terms in the co-occurrence matrix:")
print(head(available_terms, 30))

# Check if the term is misspelled or slightly different
message("\nChecking for variations of 'alzheimer' and related terms:")
alzheimer_like <- grep("alzheim|dementia|cognitive", available_terms, value = TRUE, ignore.case = TRUE)
print(alzheimer_like)

# Find our primary term in the co-occurrence matrix with flexible matching
if (!a_term %in% rownames(co_matrix)) {
  # First, try to find terms containing 'alzheimer'
  possible_matches <- alzheimer_like
  
  if (length(possible_matches) > 0) {
    message("Primary term '", a_term, "' not found exactly. Using related term: ", possible_matches[1])
    a_term <- possible_matches[1]
  } else {
    # If no Alzheimer terms, try some other neurodegenerative disease terms
    neuro_terms <- grep("neurodegenerat|parkinson|huntington|dementia", rownames(co_matrix), value = TRUE, ignore.case = TRUE)
    if (length(neuro_terms) > 0) {
      message("No Alzheimer terms found. Using alternative neurodegenerative disease term: ", neuro_terms[1])
      a_term <- neuro_terms[1]
    } else {
      # If still no match, let's use any disease term for demonstration
      disease_terms <- grep("disease", rownames(co_matrix), value = TRUE, ignore.case = TRUE)
      if (length(disease_terms) > 0) {
        message("No neurodegenerative terms found. Using general disease term for demonstration: ", disease_terms[1])
        a_term <- disease_terms[1]
      } else {
        # Last resort - take any term marked as disease in the entity types
        if (!is.null(attr(co_matrix, "entity_types"))) {
          disease_entities <- names(attr(co_matrix, "entity_types"))[attr(co_matrix, "entity_types") == "disease"]
          if (length(disease_entities) > 0) {
            a_term <- disease_entities[1]
            message("Using first disease entity: ", a_term)
          } else {
            stop("No suitable disease terms found in the co-occurrence matrix")
          }
        } else {
          stop("No terms found in the co-occurrence matrix")
        }
      }
    }
  }
} else {
  message("Found exact match for primary term")
}

# Check matrix dimensions
dim(co_matrix)
```

## 1. Classic ABC Model

The ABC model identifies potential connections between term A (your starting point) and term C (potential discoveries) through shared B terms.

```{r abc-model, eval = TRUE}
# Apply the ABC model with lower thresholds
abc_results <- abc_model(
  co_matrix,
  a_term = a_term,
  min_score = 0.0001,  # Much lower threshold
  n_results = 500  # Get more results
)

# View top results
if (nrow(abc_results) > 0) {
  head(abc_results[, c("a_term", "b_term", "c_term", "abc_score")])
} else {
  message("No results from ABC model")
}
```

### Statistical Validation of ABC Results

We can apply statistical validation to evaluate the significance of the discovered connections:

```{r validate-abc, eval = TRUE}
validated_results <- validate_abc(
  abc_results,
  co_matrix,
  alpha = 0.05,
  correction = "BH"
)

# View statistically significant connections
significant_results <- validated_results[validated_results$significant, ]
if (nrow(significant_results) > 0) {
  head(significant_results[, c("a_term", "b_term", "c_term", "abc_score", "p_value")])
} else {
  message("No statistically significant results found")
}
```

## 2. AnC Model

The AnC model extends the ABC model by considering multiple intermediate B terms:

```{r anc-model, eval = TRUE}
# Apply the AnC model with lower thresholds
anc_results <- anc_model(
  co_matrix,
  a_term = a_term,
  n_b_terms = 10,  # Consider more B terms
  c_type = NULL,  # Allow all entity types, not just drugs
  min_score = 0.0001,  # Lower threshold
  n_results = 500  # Get more results
)

# View top results
if (nrow(anc_results) > 0) {
  head(anc_results[, c("a_term", "b_terms", "c_term", "anc_score")])
} else {
  message("No results from AnC model")
}
```

## 3. BITOLA Model

The BITOLA model incorporates semantic types to guide the discovery process:

```{r bitola-model, eval = TRUE}
# Apply the BITOLA model with appropriate semantic types
bitola_results <- bitola_model(
  co_matrix,
  a_term = a_term,
  a_semantic_type = "disease",    # Set semantic type for source term
  c_semantic_type = "drug",       # Set semantic type for target terms
  min_score = 0.0001,  # Lower threshold
  n_results = 500  # Get more results
)

# View top results
if (nrow(bitola_results) > 0) {
  head(bitola_results[, c("a_term", "c_term", "support", "bitola_score", "ranking_score")])
} else {
  message("No results from BITOLA model")
}
```

## 4. LSI Model (Latent Semantic Indexing)

The LSI model uses dimensionality reduction to discover semantically related terms:

```{r lsi-model, eval = TRUE, message=FALSE}
# Create a term-document matrix for LSI model
tdm <- create_tdm(preprocessed_articles)

# Apply the LSI model
lsi_results <- lsi_model(
  tdm,
  a_term = a_term,
  n_factors = 100,  # Number of latent factors
  n_results = 500
)

# View top results
if (nrow(lsi_results) > 0) {
  head(lsi_results[, c("a_term", "c_term", "lsi_similarity")])
} else {
  message("No results from LSI model")
}
```

## Comparing Results from Different Models

It can be insightful to compare the results from different discovery models:

```{r compare-models, eval = TRUE}
# Extract C terms from each model
abc_c_terms <- if (nrow(abc_results) > 0) unique(abc_results$c_term) else character(0)
anc_c_terms <- if (nrow(anc_results) > 0) unique(anc_results$c_term) else character(0)
bitola_c_terms <- if (nrow(bitola_results) > 0) unique(bitola_results$c_term) else character(0)
lsi_c_terms <- if (nrow(lsi_results) > 0) unique(lsi_results$c_term) else character(0)

# Find common C terms across models
common_abc_anc <- intersect(abc_c_terms, anc_c_terms)
common_abc_bitola <- intersect(abc_c_terms, bitola_c_terms)
common_abc_lsi <- intersect(abc_c_terms, lsi_c_terms)
common_all <- Reduce(intersect, list(abc_c_terms, anc_c_terms, bitola_c_terms, lsi_c_terms))

# Print comparison
cat("Number of unique C terms found by each model:\n")
cat("  ABC model:", length(abc_c_terms), "\n")
cat("  AnC model:", length(anc_c_terms), "\n")
cat("  BITOLA model:", length(bitola_c_terms), "\n")
cat("  LSI model:", length(lsi_c_terms), "\n\n")

cat("Number of C terms found by multiple models:\n")
cat("  Common between ABC and AnC:", length(common_abc_anc), "\n")
cat("  Common between ABC and BITOLA:", length(common_abc_bitola), "\n")
cat("  Common between ABC and LSI:", length(common_abc_lsi), "\n")
cat("  Common across all four models:", length(common_all), "\n\n")

if (length(common_all) > 0) {
  cat("C terms found by all models:\n")
  print(common_all)
}
```