Working with Discovery Models

Discovery Models in Literature-Based Discovery

This vignette explores the various discovery models available in the LBDiscover package, which enable you to identify hidden connections in the biomedical literature.

Overview of Discovery Models

The LBDiscover package implements several discovery models:

ABC Model: The classic model that discovers implicit connections between terms A and C through intermediate B terms.
AnC Model: A variant of the ABC model that uses multiple B terms to establish stronger connections between A and C.
BITOLA Model: A model that incorporates semantic types to guide the discovery process.
LSI (Latent Semantic Indexing) Model: A model that uses dimensionality reduction to discover semantically related terms.

Each model has its strengths and is suitable for different types of discovery tasks.

Setup and Data Preparation

Let’s start by loading the package and preparing some sample data:

library(LBDiscover)

# Search for articles about Alzheimer's disease
alzheimer_articles <- pubmed_search(
  query = "alzheimer pathophysiology",
  max_results = 1000
)

# Search for treatment-related articles
drug_articles <- pubmed_search(
  query = "neurodegenerative disease treatment OR cognitive impairment therapy",
  max_results = 1000
)

# Combine and remove duplicates
all_articles <- merge_results(alzheimer_articles, drug_articles)
print(paste("Retrieved", nrow(all_articles), "unique articles"))
#> [1] "Retrieved 1955 unique articles"

# Preprocess text
preprocessed_articles <- preprocess_text(
  all_articles,
  text_column = "abstract",
  remove_stopwords = TRUE,
  min_word_length = 3
)

# Extract entities
entities <- extract_entities_workflow(
  preprocessed_articles,
  text_column = "abstract",
  entity_types = c("disease", "drug", "gene", "protein", "chemical", "pathway"),
  dictionary_sources = c("local", "mesh")
)

# Create co-occurrence matrix with less restrictive normalization
co_matrix <- create_comat(
  entities,
  doc_id_col = "doc_id",
  entity_col = "entity",
  type_col = "entity_type",
  normalize = TRUE,
  normalization_method = "dice"  # Using dice coefficient for better sensitivity
)

# Define our primary term of interest
a_term <- "alzheimer"

# Let's first examine what terms are available in the co-occurrence matrix
available_terms <- rownames(co_matrix)
message("First 30 terms in the co-occurrence matrix:")
print(head(available_terms, 30))
#>  [1] "Receptors"      "Acid"           "migraine"       "fatigue"       
#>  [5] "cancer"         "Toxins"         "Nitric"         "Menopause"     
#>  [9] "headache"       "Aromatherapy"   "heparin"        "Taste"         
#> [13] "hypoxemia"      "Phytochemicals" "Anesthetics"    "Hemolysis"     
#> [17] "thrombosis"     "Pleurodesis"

# Check if the term is misspelled or slightly different
message("\nChecking for variations of 'alzheimer' and related terms:")
alzheimer_like <- grep("alzheim|dementia|cognitive", available_terms, value = TRUE, ignore.case = TRUE)
print(alzheimer_like)
#> character(0)

# Find our primary term in the co-occurrence matrix with flexible matching
if (!a_term %in% rownames(co_matrix)) {
  # First, try to find terms containing 'alzheimer'
  possible_matches <- alzheimer_like
  
  if (length(possible_matches) > 0) {
    message("Primary term '", a_term, "' not found exactly. Using related term: ", possible_matches[1])
    a_term <- possible_matches[1]
  } else {
    # If no Alzheimer terms, try some other neurodegenerative disease terms
    neuro_terms <- grep("neurodegenerat|parkinson|huntington|dementia", rownames(co_matrix), value = TRUE, ignore.case = TRUE)
    if (length(neuro_terms) > 0) {
      message("No Alzheimer terms found. Using alternative neurodegenerative disease term: ", neuro_terms[1])
      a_term <- neuro_terms[1]
    } else {
      # If still no match, let's use any disease term for demonstration
      disease_terms <- grep("disease", rownames(co_matrix), value = TRUE, ignore.case = TRUE)
      if (length(disease_terms) > 0) {
        message("No neurodegenerative terms found. Using general disease term for demonstration: ", disease_terms[1])
        a_term <- disease_terms[1]
      } else {
        # Last resort - take any term marked as disease in the entity types
        if (!is.null(attr(co_matrix, "entity_types"))) {
          disease_entities <- names(attr(co_matrix, "entity_types"))[attr(co_matrix, "entity_types") == "disease"]
          if (length(disease_entities) > 0) {
            a_term <- disease_entities[1]
            message("Using first disease entity: ", a_term)
          } else {
            stop("No suitable disease terms found in the co-occurrence matrix")
          }
        } else {
          stop("No terms found in the co-occurrence matrix")
        }
      }
    }
  }
} else {
  message("Found exact match for primary term")
}

# Check matrix dimensions
dim(co_matrix)
#> [1] 18 18

1. Classic ABC Model

The ABC model identifies potential connections between term A (your starting point) and term C (potential discoveries) through shared B terms.

# Apply the ABC model with lower thresholds
abc_results <- abc_model(
  co_matrix,
  a_term = a_term,
  min_score = 0.0001,  # Much lower threshold
  n_results = 500  # Get more results
)
#> Filtered out 0 B terms that were too similar to A term (similarity threshold: 0.8)
#> Identifying potential C terms via 6 B terms...
#>   |                                                                              |                                                                      |   0%  |                                                                              |============                                                          |  17%  |                                                                              |=======================                                               |  33%  |                                                                              |===================================                                   |  50%  |                                                                              |===============================================                       |  67%  |                                                                              |==========================================================            |  83%  |                                                                              |======================================================================| 100%

# View top results
if (nrow(abc_results) > 0) {
  head(abc_results[, c("a_term", "b_term", "c_term", "abc_score")])
} else {
  message("No results from ABC model")
}
#>           a_term    b_term   c_term    abc_score
#> fatigue   cancer   fatigue headache 0.0005243055
#> Receptors cancer Receptors migraine 0.0005235824

Statistical Validation of ABC Results

We can apply statistical validation to evaluate the significance of the discovered connections:

validated_results <- validate_abc(
  abc_results,
  co_matrix,
  alpha = 0.05,
  correction = "BH"
)
#> Using optimized approach for large matrix validation...
#> Using metadata for document count: 271
#> Calculating statistical significance using hypergeometric test...
#> 0.0% of connections are statistically significant (p < 0.05, BH correction)

# View statistically significant connections
significant_results <- validated_results[validated_results$significant, ]
if (nrow(significant_results) > 0) {
  head(significant_results[, c("a_term", "b_term", "c_term", "abc_score", "p_value")])
} else {
  message("No statistically significant results found")
}
#> No statistically significant results found

2. AnC Model

The AnC model extends the ABC model by considering multiple intermediate B terms:

# Apply the AnC model with lower thresholds
anc_results <- anc_model(
  co_matrix,
  a_term = a_term,
  n_b_terms = 10,  # Consider more B terms
  c_type = NULL,  # Allow all entity types, not just drugs
  min_score = 0.0001,  # Lower threshold
  n_results = 500  # Get more results
)
#> Validating biomedical relevance of B terms...
#> Retained 5 biomedically relevant B terms after filtering
#> Validating biomedical relevance of C terms...
#> Retained 12 biomedically relevant C terms after filtering
#> Analyzing 12 potential C terms...
#>   |                                                                              |                                                                      |   0%  |                                                                              |======                                                                |   8%  |                                                                              |============                                                          |  17%  |                                                                              |==================                                                    |  25%  |                                                                              |=======================                                               |  33%  |                                                                              |=============================                                         |  42%  |                                                                              |===================================                                   |  50%  |                                                                              |=========================================                             |  58%  |                                                                              |===============================================                       |  67%  |                                                                              |====================================================                  |  75%  |                                                                              |==========================================================            |  83%  |                                                                              |================================================================      |  92%  |                                                                              |======================================================================| 100%
#> No AnC connections found

# View top results
if (nrow(anc_results) > 0) {
  head(anc_results[, c("a_term", "b_terms", "c_term", "anc_score")])
} else {
  message("No results from AnC model")
}
#> No results from AnC model

3. BITOLA Model

The BITOLA model incorporates semantic types to guide the discovery process:

# Apply the BITOLA model with appropriate semantic types
bitola_results <- bitola_model(
  co_matrix,
  a_term = a_term,
  a_semantic_type = "disease",    # Set semantic type for source term
  c_semantic_type = "drug",       # Set semantic type for target terms
  min_score = 0.0001,  # Lower threshold
  n_results = 500  # Get more results
)
#> Analyzing 6 B terms...
#>   |                                                                              |                                                                      |   0%  |                                                                              |============                                                          |  17%  |                                                                              |=======================                                               |  33%  |                                                                              |===================================                                   |  50%  |                                                                              |===============================================                       |  67%  |                                                                              |==========================================================            |  83%  |                                                                              |======================================================================| 100%
#> No BITOLA connections found

# View top results
if (nrow(bitola_results) > 0) {
  head(bitola_results[, c("a_term", "c_term", "support", "bitola_score", "ranking_score")])
} else {
  message("No results from BITOLA model")
}
#> No results from BITOLA model

4. LSI Model (Latent Semantic Indexing)

The LSI model uses dimensionality reduction to discover semantically related terms:

# Create a term-document matrix for LSI model
tdm <- create_tdm(preprocessed_articles)

# Apply the LSI model
lsi_results <- lsi_model(
  tdm,
  a_term = a_term,
  n_factors = 100,  # Number of latent factors
  n_results = 500
)

# View top results
if (nrow(lsi_results) > 0) {
  head(lsi_results[, c("a_term", "c_term", "lsi_similarity")])
} else {
  message("No results from LSI model")
}
#>   a_term         c_term lsi_similarity
#> 1 cancer adenocarcinoma      0.9524903
#> 2 cancer      carcinoma      0.9492129
#> 3 cancer        peptide      0.1920824
#> 4 cancer      secretase      0.1713798
#> 5 cancer        disease      0.1549414
#> 6 cancer           base      0.1500490

Comparing Results from Different Models

It can be insightful to compare the results from different discovery models:

# Extract C terms from each model
abc_c_terms <- if (nrow(abc_results) > 0) unique(abc_results$c_term) else character(0)
anc_c_terms <- if (nrow(anc_results) > 0) unique(anc_results$c_term) else character(0)
bitola_c_terms <- if (nrow(bitola_results) > 0) unique(bitola_results$c_term) else character(0)
lsi_c_terms <- if (nrow(lsi_results) > 0) unique(lsi_results$c_term) else character(0)

# Find common C terms across models
common_abc_anc <- intersect(abc_c_terms, anc_c_terms)
common_abc_bitola <- intersect(abc_c_terms, bitola_c_terms)
common_abc_lsi <- intersect(abc_c_terms, lsi_c_terms)
common_all <- Reduce(intersect, list(abc_c_terms, anc_c_terms, bitola_c_terms, lsi_c_terms))

# Print comparison
cat("Number of unique C terms found by each model:\n")
#> Number of unique C terms found by each model:
cat("  ABC model:", length(abc_c_terms), "\n")
#>   ABC model: 2
cat("  AnC model:", length(anc_c_terms), "\n")
#>   AnC model: 0
cat("  BITOLA model:", length(bitola_c_terms), "\n")
#>   BITOLA model: 0
cat("  LSI model:", length(lsi_c_terms), "\n\n")
#>   LSI model: 185

cat("Number of C terms found by multiple models:\n")
#> Number of C terms found by multiple models:
cat("  Common between ABC and AnC:", length(common_abc_anc), "\n")
#>   Common between ABC and AnC: 0
cat("  Common between ABC and BITOLA:", length(common_abc_bitola), "\n")
#>   Common between ABC and BITOLA: 0
cat("  Common between ABC and LSI:", length(common_abc_lsi), "\n")
#>   Common between ABC and LSI: 0
cat("  Common across all four models:", length(common_all), "\n\n")
#>   Common across all four models: 0

if (length(common_all) > 0) {
  cat("C terms found by all models:\n")
  print(common_all)
}

Chao Liu

2025-05-15