{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Epilepsy Comorbidity Analysis using SCAIView\n", "\n", "This notebook contains the quantification of gene overlap comparing Epilepsy with other disorders using text mining presented in Hoyt and Domingo-Fernandez *et al.*, 2018." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import csv\n", "import os\n", "import sys\n", "import time\n", "\n", "from operator import itemgetter\n", "\n", "from matplotlib import pyplot as plt\n", "from matplotlib_venn import venn3, venn2\n", "import numpy as np\n", "import pandas as pd\n", "import scipy.stats as stats\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "%config InlineBackend.figure_format = 'svg'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3.6.5 (default, Apr 20 2018, 08:54:42) \n", "[GCC 4.8.5 20150623 (Red Hat 4.8.5-16)]\n" ] } ], "source": [ "print(sys.version)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fri Apr 20 11:07:01 2018\n" ] } ], "source": [ "print(time.asctime())" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "current_path = os.getcwd() # Notebook abs path" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Queries\n", "\n", "The following two sets of queries were used in this analysis:\n", "\n", "## Reference Queries\n", "\n", "- `[MeSH Disease:\"Epilepsy\"]`\n", "- `[MeSH Disease:\"Alzheimer Disease\"]`\n", "- `[MeSH Disease:\"Tuberculosis\"]`\n", "- `[MeSH Disease:\"Parkinson Disease\"]`\n", "- `[MeSH Disease:\"Dementia\"]`\n", "- `[MeSH Disease:\"Migraine Disorders\"]`\n", "- `[MeSH Disease:\"Diabetes Mellitus\"]`\n", "- `[MeSH Disease:\"Pulmonary Disease 
Chronic Obstructive\"]`\n", "- `[MeSH Disease:\"Peptic Ulcer\"]`\n", "- `[MeSH Disease:\"Anxiety Disorders\"]`\n", "- `[MeSH Disease:\"Urinary Incontinence\"]`\n", "- `[MeSH Disease:\"Cataract\"]`\n", "- `[MeSH Disease:\"Hypertension\"]`\n", "- `[MeSH Disease:\"Arthritis\"]`\n", "- `[MeSH Disease:\"Asthma\"]`\n", "- `[MeSH Disease:\"Bronchitis Chronic\"]`\n", "- `[MeSH Disease:\"Emphysema\"]`\n", "- `[MeSH Disease:\"Fibromyalgia\"]`\n", "- `[MeSH Disease:\"Glaucoma\"]`\n", "- `[MeSH Disease:\"Intestinal Diseases\"]`\n", "- `[MeSH Disease:\"Thyroid Diseases\"]`\n", "- `[MeSH Disease:\"Depressive Disorder, Major\"]`\n", "- `[MeSH Disease:\"Back Pain\"]`\n", "- `[MeSH Disease:\"Stroke\"]`\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Queries used for calculating pleiotropy rates\n", "\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Alzheimer Disease\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Parkinson Disease\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Dementia\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Migraine Disorders\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Diabetes Mellitus\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Pulmonary Disease Chronic Obstructive\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Peptic Ulcer\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Anxiety Disorders\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Urinary Incontinence\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Cataract\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Hypertension\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Arthritis\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Asthma\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Bronchitis Chronic\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Emphysema\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Fibromyalgia\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH 
Disease:\"Glaucoma\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Intestinal Diseases\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Thyroid Diseases\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Depressive Disorder, Major\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Back Pain\"]`\n", "- `[MeSH Disease:\"Epilepsy\"] AND [MeSH Disease:\"Stroke\"]`\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The queries were retrieved using SCAIView version 1.7.3,\n", "corresponding to the indexing of MEDLINE on 2016-07-14T13:50:07.797575Z.\n", "\n", "*Note that the reference queries might take time since thousands of articles need to be analyzed.*" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Notebook results" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ConditionMeSH IDReference QueryAssociated documentsDisease-associated genesComorbidity-associated genesNormalized pleitropy rate (%)
0EpilepsyD004827[MeSH Disease:\"Epilepsy\"]1922452901.000--
1StrokeD020521[MeSH Disease:\"Stroke\"]2108464533.00063317.78
2Alzheimer's DiseaseD000544[MeSH Disease:\"Alzheimer Disease\"]1094954968.00039613.65
3MigraineD008881[MeSH Disease:\"Migraine Disorders\"]309281230.00030610.54
4Parkinson's DiseaseD010300[MeSH Disease:\"Parkinson Disease\"]791033646.0002588.89
5HypertensionD006973[MeSH Disease:\"Hypertension\"]3911905574.0002528.68
6DementiaD003704[MeSH Disease:\"Dementia\"]1838025833.0002207.58
7Diabetes MellitusD003920[MeSH Disease:\"Diabetes Mellitus\"]3944116661.0001846.34
8Intestinal DiseasesD007410[MeSH Disease:\"Intestinal Diseases\"]6296919.0931665.72
9Thyroid DiseasesD013959[MeSH Disease:\"Thyroid Diseases\"]1530254366.0001334.58
10AnxietyD001007[MeSH Disease:\"Anxiety Disorders\"]841381782.0001244.27
11ArthritisD001168[MeSH Disease:\"Arthritis\"]2593275367.0001224.2
12CataractD002386[MeSH Disease:\"Cataract\"]521502238.0001194.1
13AsthmaD001249[MeSH Disease:\"Asthma\"]1476973761.000862.96
14GlaucomaD005901[MeSH Disease:\"Glaucoma\"]566792303.000481.65
15Depressive Disorder, MajorD003865[MeSH Disease:\"Depressive Disorder, Major\"]157061249.000461.58
16Urinary IncontinenceD014549[MeSH Disease:\"Urinary Incontinence\"]34170720.000240.82
17Peptic UlcerD010437[MeSH Disease:\"Peptic Ulcer\"]682341445.000210.72
18Back PainD001416[MeSH Disease:\"Back Pain\"]485161191.000170.58
19Pulmonary Disease, Chronic ObstructiveD029424[MeSH Disease:\"Pulmonary Disease Chronic Obstr...356272244.000150.51
20FibromyalgiaD005356[MeSH Disease:\"Fibromyalgia\"]9021468.000100.34
21EmphysemaD004646[MeSH Disease:\"Emphysema\"]255111261.00090.31
22Bronchitis ChronicD029481[MeSH Disease:\"Bronchitis Chronic\"]9085580.00020.06
\n", "
" ], "text/plain": [ " Condition MeSH ID \\\n", "0 Epilepsy D004827 \n", "1 Stroke D020521 \n", "2 Alzheimer's Disease D000544 \n", "3 Migraine D008881 \n", "4 Parkinson's Disease D010300 \n", "5 Hypertension D006973 \n", "6 Dementia D003704 \n", "7 Diabetes Mellitus D003920 \n", "8 Intestinal Diseases D007410 \n", "9 Thyroid Diseases D013959 \n", "10 Anxiety D001007 \n", "11 Arthritis D001168 \n", "12 Cataract D002386 \n", "13 Asthma D001249 \n", "14 Glaucoma D005901 \n", "15 Depressive Disorder, Major D003865 \n", "16 Urinary Incontinence D014549 \n", "17 Peptic Ulcer D010437 \n", "18 Back Pain D001416 \n", "19 Pulmonary Disease, Chronic Obstructive D029424 \n", "20 Fibromyalgia D005356 \n", "21 Emphysema D004646 \n", "22 Bronchitis Chronic D029481 \n", "\n", " Reference Query Associated documents \\\n", "0 [MeSH Disease:\"Epilepsy\"] 192245 \n", "1 [MeSH Disease:\"Stroke\"] 210846 \n", "2 [MeSH Disease:\"Alzheimer Disease\"] 109495 \n", "3 [MeSH Disease:\"Migraine Disorders\"] 30928 \n", "4 [MeSH Disease:\"Parkinson Disease\"] 79103 \n", "5 [MeSH Disease:\"Hypertension\"] 391190 \n", "6 [MeSH Disease:\"Dementia\"] 183802 \n", "7 [MeSH Disease:\"Diabetes Mellitus\"] 394411 \n", "8 [MeSH Disease:\"Intestinal Diseases\"] 629691 \n", "9 [MeSH Disease:\"Thyroid Diseases\"] 153025 \n", "10 [MeSH Disease:\"Anxiety Disorders\"] 84138 \n", "11 [MeSH Disease:\"Arthritis\"] 259327 \n", "12 [MeSH Disease:\"Cataract\"] 52150 \n", "13 [MeSH Disease:\"Asthma\"] 147697 \n", "14 [MeSH Disease:\"Glaucoma\"] 56679 \n", "15 [MeSH Disease:\"Depressive Disorder, Major\"] 15706 \n", "16 [MeSH Disease:\"Urinary Incontinence\"] 34170 \n", "17 [MeSH Disease:\"Peptic Ulcer\"] 68234 \n", "18 [MeSH Disease:\"Back Pain\"] 48516 \n", "19 [MeSH Disease:\"Pulmonary Disease Chronic Obstr... 
# Show the precomputed summary table (Table 1), read from the tab-separated
# file in the "results" folder next to the notebook.
pd.read_csv(
    os.path.join(current_path, 'results', 'gene_overlap_results.tsv'),
    sep='\t',
)
# Folders, relative to the notebook working directory, used for the input
# exports ("resources") and for the summary table ("results").
resources_path = os.path.join(current_path, "resources")
results_path = os.path.join(current_path, "results")

# Epilepsy reference query
epilepsy_path = os.path.join(resources_path, 'epilepsy.csv')

# Results from the combined queries (epilepsy + comorbidity), as
# (display name, path to export) pairs.
#
# Grouped by International Classification of Diseases (ICD) chapter, following
# Keezer et al., 2016 (https://www.ncbi.nlm.nih.gov/pubmed/26549780)
comorbidities = [
    (label, os.path.join(resources_path, file_name))
    for label, file_name in (
        # ICD chapter IV: endocrine, nutritional, and metabolic diseases
        ("Diabetes", 'diabetes_epilepsy.csv'),
        ("Thyroid diseases", 'thyroid_diseases_epilepsy.csv'),
        # ICD chapter V: mental and behavioural disorders
        ("Anxiety", 'anxiety_epilepsy.csv'),
        ("Major Depression", 'major_depression_epilepsy.csv'),
        # ICD chapter VI: nervous system
        ("Alzheimer's disease", 'alzheimers_epilepsy.csv'),
        # Parkinson's is included since we have a Knowledge Assembly
        ("Parkinson's disease", 'parkinson_epilepsy.csv'),
        ("Migraine", 'migraine_epilepsy.csv'),
        ("Dementia", 'dementia_epilepsy.csv'),
        # ICD chapter VII: eye and adnexa
        ("Cataracts", 'cataracts_epilepsy.csv'),
        ("Glaucoma", 'glaucoma_epilepsy.csv'),
        # ICD chapter IX: circulatory system
        ("Hypertension", 'hypertension_epilepsy.csv'),
        ("Stroke", 'stroke_epilepsy.csv'),
        # ICD chapter X: respiratory system
        ("Pulmonary Disease Chronic Obstructive", 'copd_epilepsy.csv'),
        ("Chronic bronchitis", 'bronchitis_chronic_epilepsy.csv'),
        ("Emphysema", 'emphysema_epilepsy.csv'),
        ("Asthma", 'asthma_epilepsy.csv'),
        # ICD chapter XI: digestive system
        ("Peptic ulcers", 'peptic_ulcer_epilepsy.csv'),
        # NOTE(review): the only file name without the "_epilepsy" suffix --
        # confirm it is the combined-query export and not the reference query.
        ("Bowel diseases", 'intestinal_diseases.csv'),
        # ICD chapter XIII: musculoskeletal system and connective tissues
        ("Fibromyalgia", 'fibromyalgia_epilepsy.csv'),
        ("Arthritis", 'arthritis_epilepsy.csv'),
        ("Back pain", 'back_pain_epilepsy.csv'),
        # ICD chapter XIV: genitourinary system
        ("Urinary incontinence", 'urinary_incontinence_epilepsy.csv'),
    )
]
def parser_scaiview_csv(path):
    """Parse a SCAIView gene export into a ``{gene: relative entropy}`` dictionary.

    Every record is expected to be semicolon separated, in the form
    ``Common Name;Internal Identifier;Relative Entropy;...;``. Only the text
    before the first tab character of a record is considered. The first line
    of the file is treated as the header and blank lines are ignored.

    :param str path: path to the file exported from SCAIView
    :return: common name of every gene whose relative entropy is strictly
        greater than 0, mapped to that relative entropy (kept as the string
        read from the file)
    :rtype: dict[str, str]
    :raises IndexError: if a non-blank record has fewer than three fields
    :raises ValueError: if a relative entropy cannot be converted to ``float``
    """
    gene_dict = {}

    # newline='' is what the csv module asks for when it is given a file object
    with open(path, "r", newline="") as file:
        reader = csv.reader(file, delimiter="\t")
        for i, row in enumerate(reader):
            # Skip the header; csv.reader yields an empty list for blank lines,
            # which would otherwise fail on row[0]
            if i == 0 or not row:
                continue

            # The export is semicolon separated, so the whole record sits in
            # the first tab-delimited field
            fields = row[0].split(';')

            # Only keep genes whose relative entropy is greater than 0
            # (a score of exactly 0 is excluded as well)
            if float(fields[2]) <= 0:
                continue

            # Populate the {gene: relative entropy} dictionary
            gene_dict[fields[0]] = fields[2]

    return gene_dict


def print_results(disease, reference_disease, size_geneset, size_reference):
    """Print the geneset size of a disease and its rate relative to the reference.

    The rate is ``size_geneset / size_reference``.

    :param str disease: disease name
    :param str reference_disease: reference disease to compare
    :param int size_geneset: number of genes in query disease + reference
    :param int size_reference: number of genes found in reference query
    :raises ZeroDivisionError: if ``size_reference`` is 0
    """
    rate = size_geneset / size_reference
    print('{} + {} results: geneset size {} - pleitropy rate: {}\n'.format(
        disease,
        reference_disease,
        size_geneset,
        rate,
    ))
# (disease, number of genes) for every "comorbidity + epilepsy" export
disease_geneset = [
    (name, len(parser_scaiview_csv(csv_path)))
    for name, csv_path in comorbidities
]

# Largest genesets first
sorted_diseases = sorted(disease_geneset, key=itemgetter(1), reverse=True)

# Reference set for epilepsy: its size is the denominator of every rate below
epilepsy_geneset = parser_scaiview_csv(epilepsy_path)

print('Reference Epilepsy geneset size {} \n'.format(len(epilepsy_geneset)))

print('Pleitropy rates \n')

# The second item of each pair is a geneset size, not the geneset itself
for disease, geneset_size in sorted_diseases:
    print_results(disease, 'Epilepsy', geneset_size, len(epilepsy_geneset))

# Genesets of the three combined queries compared in the Venn diagram
alzheimers_epilepsy_geneset = parser_scaiview_csv(
    os.path.join(resources_path, 'alzheimers_epilepsy.csv')
)
migraine_epilepsy_geneset = parser_scaiview_csv(
    os.path.join(resources_path, 'migraine_epilepsy.csv')
)
parkinsons_epilepsy_geneset = parser_scaiview_csv(
    os.path.join(resources_path, 'parkinson_epilepsy.csv')
)
plt.figure(figsize=(6, 6))

# Three-way overlap of the gene names found by the Alzheimer's, migraine and
# Parkinson's "+ epilepsy" queries. Iterating a geneset dictionary yields its
# gene names, so set(...) gives venn3 plain sets rather than dict key views.
venn3(
    [
        set(alzheimers_epilepsy_geneset),
        set(migraine_epilepsy_geneset),
        set(parkinsons_epilepsy_geneset),
    ],
    set_labels=("Alzheimer's disease", 'Migraines', "Parkinson's disease"),
)
plt.title("Comorbidity Analysis with Literature-Based Gene Overlap")
plt.show()
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Check the distribution of relative entropies in each query" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
# The parsed geneset keeps relative entropies as strings; convert them to floats
relative_entropies = [
    float(value)
    for value in alzheimers_epilepsy_geneset.values()
]

# Distribution of the relative entropies of the Alzheimer's + epilepsy query
plt.title('Alzheimers + Epilepsy Query')
plt.xlabel('Relative Entropy')
plt.ylabel('Frequency of Relative Entropy')
sns.distplot(relative_entropies)
plt.show()