Raw File
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Momi2 analysis\n",
    "## Notebook must be run in python3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import momi\n",
    "import glob\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import ipyparallel as ipp\n",
    "import datetime\n",
    "import time\n",
    "import logging\n",
    "import sys\n",
    "import os\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "simout = \"simout\"\n",
    "popfile = \"{}/simpops.txt\".format(simout)\n",
    "\n",
    "logging.basicConfig(level=logging.DEBUG,\n",
    "                    filename=\"momi_log.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [####################] 100% done  "
     ]
    }
   ],
   "source": [
    "def progressbar(njobs, finished, msg=\"\", spacer=\"  \"):\n",
    "    \"\"\" prints a progress bar \"\"\"\n",
    "    if njobs:\n",
    "        progress = 100*(finished / float(njobs))\n",
    "    else:\n",
    "        progress = 100\n",
    "\n",
    "    hashes = '#'*int(progress/5.)\n",
    "    nohash = ' '*int(20-len(hashes))\n",
    "\n",
    "    args = [spacer, hashes+nohash, int(progress), msg]\n",
    "    print(\"\\r{}[{}] {:>3}% {} \".format(*args), end=\"\")\n",
    "    sys.stdout.flush()\n",
    "for i in range(100):\n",
    "    progressbar(100, i, \"watdo\")\n",
    "progressbar(100, 100, \"done\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate the popsfile\n",
    "For 2 population models just split the vcf samples in half and assign the first\n",
    "half to pop1, and the second half to pop2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['msp_0', 'msp_1', 'msp_2', 'msp_3', 'msp_4', 'msp_5', 'msp_6', 'msp_7']\n",
      "{'msp_0': 'pop1', 'msp_1': 'pop1', 'msp_2': 'pop1', 'msp_3': 'pop1', 'msp_4': 'pop2', 'msp_5': 'pop2', 'msp_6': 'pop2', 'msp_7': 'pop2'}\n",
      "msp_0\tpop1\r\n",
      "msp_1\tpop1\r\n",
      "msp_2\tpop1\r\n",
      "msp_3\tpop1\r\n",
      "msp_4\tpop2\r\n",
      "msp_5\tpop2\r\n",
      "msp_6\tpop2\r\n",
      "msp_7\tpop2\r\n"
     ]
    }
   ],
   "source": [
    "dat = !head simout/nomigration-sim0.vcf \n",
    "## Relying on the fact that the vcf files 5th line contains the samples\n",
    "## which are indexed as 9 and on\n",
    "samps = dat[5].split()[9:]\n",
    "print(samps)\n",
    "samps_per_pop = int(len(samps)/2)\n",
    "popdict = {x:\"pop1\" for x in samps[:samps_per_pop]}\n",
    "popdict.update({x:\"pop2\" for x in samps[samps_per_pop:]})\n",
    "print(popdict)\n",
    "with open(popfile, 'w') as outfile:\n",
    "    for k,v in popdict.items():\n",
    "        outfile.write(\"{}\\t{}\\n\".format(k,v))\n",
    "!cat simout/simpops.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate sfs files per vcf\n",
    "* **Population assignment file** - This is a tab or space separated list of sample names and population names to which they are assigned. Sample names need to be exactly the same as they are in the VCF file. Population names can be anything, but it’s useful if they’re meaningful.  \n",
    "* **Properly formatted VCF** - We do have the VCF file output from the ipyrad Anolis assembly, but it requires a bit of massaging before it’s ready for momi2. It must be zipped and indexed in such a way as to make it searchable.  \n",
    "* **BED file** - This file specifies genomic regions to include in when calculating the SFS. It is composed of 3 columns which specify ‘chrom’, ‘chromStart’, and ‘chromEnd’.  \n",
    "* **The allele counts file** - The allele counts file is an intermediate file that we must generate on the way to constructing the SFS. momi2 provides a function for this.  \n",
    "* **Genereate the SFS** - The culmination of all this housekeeping is the SFS file which we will use for demographic inference."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 550,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "simout/asymmetric_migration/asymmetric_migration-sim0.vcf\n",
      "simout/asymmetric_migration/asymmetric_migration-sim1.vcf\n",
      "simout/asymmetric_migration/asymmetric_migration-sim2.vcf\n",
      "simout/asymmetric_migration/asymmetric_migration-sim3.vcf\n",
      "simout/asymmetric_migration/asymmetric_migration-sim4.vcf\n",
      "simout/no_migration/no_migration-sim0.vcf\n",
      "simout/no_migration/no_migration-sim1.vcf\n",
      "simout/no_migration/no_migration-sim2.vcf\n",
      "simout/no_migration/no_migration-sim3.vcf\n",
      "simout/no_migration/no_migration-sim4.vcf\n",
      "simout/symmetric_migration/symmetric_migration-sim0.vcf\n",
      "simout/symmetric_migration/symmetric_migration-sim1.vcf\n",
      "simout/symmetric_migration/symmetric_migration-sim2.vcf\n",
      "simout/symmetric_migration/symmetric_migration-sim3.vcf\n",
      "simout/symmetric_migration/symmetric_migration-sim4.vcf\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "for i in `ls simout/*/*.vcf`; \\\n",
    "    do echo $i; \\\n",
    "    bgzip -c $i > $i.gz; \\\n",
    "    tabix $i.gz;\\\n",
    "    python vcf2bed.py $i;\n",
    "    done"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 551,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'no_migration': 'simout/no_migration', 'asymmetric_migration': 'simout/asymmetric_migration', 'symmetric_migration': 'simout/symmetric_migration'}\n",
      "{'no_migration': [], 'asymmetric_migration': [], 'symmetric_migration': []}\n",
      "simout/no_migration/no_migration-sim1.vcf.gz\n",
      "simout/no_migration/no_migration-sim4.vcf.gz\n",
      "simout/no_migration/no_migration-sim2.vcf.gz\n",
      "simout/no_migration/no_migration-sim3.vcf.gz\n",
      "simout/no_migration/no_migration-sim0.vcf.gz\n",
      "simout/asymmetric_migration/asymmetric_migration-sim1.vcf.gz\n",
      "simout/asymmetric_migration/asymmetric_migration-sim0.vcf.gz\n",
      "simout/asymmetric_migration/asymmetric_migration-sim3.vcf.gz\n",
      "simout/asymmetric_migration/asymmetric_migration-sim2.vcf.gz\n",
      "simout/asymmetric_migration/asymmetric_migration-sim4.vcf.gz\n",
      "simout/symmetric_migration/symmetric_migration-sim1.vcf.gz\n",
      "simout/symmetric_migration/symmetric_migration-sim4.vcf.gz\n",
      "simout/symmetric_migration/symmetric_migration-sim3.vcf.gz\n",
      "simout/symmetric_migration/symmetric_migration-sim2.vcf.gz\n",
      "simout/symmetric_migration/symmetric_migration-sim0.vcf.gz\n",
      "3\n"
     ]
    }
   ],
   "source": [
    "model_dirs = {x.split(\"/\")[1]:x for x in glob.glob(\"simout/*\") if os.path.isdir(x)}\n",
    "sfs_dict = {x:[] for x in model_dirs}\n",
    "print(model_dirs)\n",
    "print(sfs_dict)\n",
    "\n",
    "for m, d in model_dirs.items():\n",
    "    for f in glob.glob(d + \"/*.vcf.gz\"):\n",
    "        print(f)\n",
    "        allele_counts = momi.SnpAlleleCounts.read_vcf(vcf_file=f, ind2pop=popdict, bed_file=f.split(\".\")[0]+\".bed\", ancestral_alleles=False)\n",
    "        allele_counts.dump(f.split(\".\")[0]+\"_allele_counts.gz\")\n",
    "        sfs = allele_counts.extract_sfs(n_blocks=None)\n",
    "        sfs_dict[m].append(sfs)\n",
    "        sfs.dump(f.split(\".\")[0]+\".sfs\")\n",
    "print(len(sfs_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 258,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/isaac/miniconda2/envs/momi-py36/lib/python3.6/site-packages/momi/compute_sfs.py:511: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n",
      "  [slice(None)]],\n",
      "/home/isaac/miniconda2/envs/momi-py36/lib/python3.6/site-packages/autograd/numpy/numpy_boxes.py:13: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n",
      "  def __getitem__(A, idx): return A[idx]\n",
      "/home/isaac/miniconda2/envs/momi-py36/lib/python3.6/site-packages/autograd/numpy/numpy_vjps.py:597: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n",
      "  onp.add.at(A, idx, x)\n",
      "/home/isaac/miniconda2/envs/momi-py36/lib/python3.6/site-packages/autograd/numpy/numpy_vjps.py:444: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n",
      "  return lambda g: g[idxs]\n",
      "/home/isaac/miniconda2/envs/momi-py36/lib/python3.6/site-packages/momi/compute_sfs.py:458: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n",
      "  [slice(None)] + [0] * len(b.pop_labels)]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "200156.892920796\n",
      "187080.75895088378 210358.50332806082\n"
     ]
    }
   ],
   "source": [
    "def optimize(sfs):\n",
    "    no_migration_model = momi.DemographicModel(N_e=1e5,\n",
    "                                               #muts_per_gen=1e-8,\n",
    "                                               gen_time=1)\n",
    "\n",
    "    no_migration_model.add_time_param(\"tdiv\")\n",
    "\n",
    "    no_migration_model.add_leaf(\"pop1\")\n",
    "    no_migration_model.add_leaf(\"pop2\")\n",
    "    no_migration_model.move_lineages(\"pop1\", \"pop2\", t=\"tdiv\")\n",
    "\n",
    "    no_migration_model.set_data(sfs)\n",
    "\n",
    "    return no_migration_model.optimize()\n",
    "results = []\n",
    "for sfs in sfs_list:\n",
    "    results.append(optimize(sfs))\n",
    "print(np.mean([x.parameters.tdiv for x in results]))\n",
    "print(min([x.parameters.tdiv for x in results]), max([x.parameters.tdiv for x in results]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 470,
   "metadata": {},
   "outputs": [],
   "source": [
    "def optimize(sfs, test_model=\"no_migration\"):\n",
    "    import momi\n",
    "    import pandas as pd\n",
    "    model = momi.DemographicModel(N_e=1e5,\n",
    "                                  #muts_per_gen=1e-8,\n",
    "                                  gen_time=1)\n",
    "\n",
    "    model.add_leaf(\"pop1\")\n",
    "    model.add_leaf(\"pop2\")\n",
    "\n",
    "    ## Move some lineages one way or the other or both\n",
    "    if test_model == \"asymmetric_migration\":\n",
    "        model.add_time_param(\"tmig_pop1_pop2\")\n",
    "        model.add_time_param(\"tmig_pop2_pop1\")\n",
    "        model.add_pulse_param(\"mfrac_pop1_pop2\", upper=.2)\n",
    "        model.add_pulse_param(\"mfrac_pop2_pop1\", upper=.2)\n",
    "        model.move_lineages(\"pop1\", \"pop2\", t=\"tmig_pop1_pop2\", p=\"mfrac_pop1_pop2\")\n",
    "        model.move_lineages(\"pop2\", \"pop1\", t=\"tmig_pop1_pop2\", p=\"mfrac_pop2_pop1\")\n",
    "        model.add_time_param(\"tdiv\", lower_constraints=[\"tmig_pop1_pop2\", \"tmig_pop2_pop1\"])\n",
    "\n",
    "    elif test_model == \"symmetric_migration\":\n",
    "        model.add_pulse_param(\"mfrac\", upper=.2)\n",
    "        model.add_time_param(\"tmig\")\n",
    "        model.move_lineages(\"pop1\", \"pop2\", t=\"tmig\", p=\"mfrac\")\n",
    "        model.move_lineages(\"pop2\", \"pop1\", t=\"tmig\", p=\"mfrac\")\n",
    "        model.add_time_param(\"tdiv\", lower_constraints=[\"tmig\"])\n",
    "    else:\n",
    "        ## No migration\n",
    "        model.add_time_param(\"tdiv\")\n",
    "\n",
    "    ## Divergence event\n",
    "    model.move_lineages(\"pop1\", \"pop2\", t=\"tdiv\")\n",
    "\n",
    "    model.set_data(sfs)\n",
    "\n",
    "    return pd.Series(model.optimize())\n",
    "#results = []\n",
    "#for sfs in sfs_list:\n",
    "#    results.append(optimize(sfs))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run optimization for each model serially\n",
    "Slow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 406,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "no_migration no_migration\n",
      "0\n",
      "1\n",
      "2\n",
      "3\n",
      "4\n"
     ]
    }
   ],
   "source": [
    "models = [\"no_migration\", \"symmetric_migration\", \"asymmetric_migration\"]\n",
    "models = sfs_dict.keys()\n",
    "## Results dictionary for keeping track ouf output testing each model on the simulated data\n",
    "res_dict = {x:{y:pd.DataFrame() for y in models} for x in models}\n",
    "for model in models:\n",
    "    for testmodel in models:\n",
    "        print(model, testmodel)\n",
    "        for i, sfs in enumerate(sfs_dict[model]):\n",
    "            print(i),\n",
    "            res = pd.Series(optimize(sfs, test_model=testmodel))\n",
    "            res_dict[model][testmodel] = res_dict[model][testmodel].append(res, ignore_index=True)\n",
    "res_dict[\"no_migration\"][\"no_migration\"][\"log_likelihood\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run optimization for each model in parallel\n",
    "100 loci runs in 30'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 580,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5\n"
     ]
    }
   ],
   "source": [
    "## Run this in a python3 env and wait for clients to attach\n",
    "#!ipcluster start -n 4 --cluster-id=ipp3 --daemonize\n",
    "ipyclient = ipp.Client(cluster_id=\"ipp3\")\n",
    "print(len(ipyclient.ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 566,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [####################] 100%  |  0:00:08  |  no_migration/no_migration   \n",
      "  [####################] 100%  |  0:04:46  |  no_migration/asymmetric_migration   \n",
      "  [####################] 100%  |  0:09:18  |  no_migration/symmetric_migration   \n",
      "  [####################] 100%  |  0:09:31  |  asymmetric_migration/no_migration   \n",
      "  [####################] 100%  |  0:15:29  |  asymmetric_migration/asymmetric_migration   \n",
      "  [####################] 100%  |  0:21:59  |  asymmetric_migration/symmetric_migration   \n",
      "  [####################] 100%  |  0:22:10  |  symmetric_migration/no_migration   \n",
      "  [####################] 100%  |  0:29:57  |  symmetric_migration/asymmetric_migration   \n",
      "  [####################] 100%  |  0:33:23  |  symmetric_migration/symmetric_migration   \n"
     ]
    }
   ],
   "source": [
    "ipyclient[:].use_dill()\n",
    "lbview = ipyclient.load_balanced_view()\n",
    "\n",
    "start = time.time()\n",
    "printstr = \" |  {}  |  {}/{}  \"\n",
    "res_dict = {x:{y:pd.DataFrame() for y in models} for x in models}\n",
    "for model in models:\n",
    "    for testmodel in models:\n",
    "        parallel_jobs = {}\n",
    "        for i, sfs in enumerate(sfs_dict[model]):\n",
    "            parallel_jobs[i] = lbview.apply(optimize, *(sfs, testmodel))\n",
    "\n",
    "        ## Wait for all jobs to finish\n",
    "        while 1:\n",
    "            fin = [i.ready() for i in parallel_jobs.values()]\n",
    "            elapsed = datetime.timedelta(seconds=int(time.time()-start))\n",
    "            progressbar(len(fin), sum(fin), printstr.format(elapsed, model, testmodel))\n",
    "            time.sleep(0.1)\n",
    "            if len(fin) == sum(fin):\n",
    "                print(\"\")\n",
    "                break\n",
    "\n",
    "        ## Fetch results\n",
    "        for job in parallel_jobs:\n",
    "            if parallel_jobs[job].ready():\n",
    "                if not parallel_jobs[job].successful():\n",
    "                    raise Exception((job, parallel_jobs[job].result()))\n",
    "                else:\n",
    "                    res = parallel_jobs[job].result()\n",
    "                    res_dict[model][testmodel] = res_dict[model][testmodel].append(res, ignore_index=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 584,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>fun</th>\n",
       "      <th>jac</th>\n",
       "      <th>kl_divergence</th>\n",
       "      <th>log_likelihood</th>\n",
       "      <th>message</th>\n",
       "      <th>nfev</th>\n",
       "      <th>nit</th>\n",
       "      <th>parameters</th>\n",
       "      <th>status</th>\n",
       "      <th>success</th>\n",
       "      <th>x</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.050932</td>\n",
       "      <td>[-4.9400055099086546e-14]</td>\n",
       "      <td>0.050932</td>\n",
       "      <td>-6217.879747</td>\n",
       "      <td>Converged (|f_n-f_(n-1)| ~= 0)</td>\n",
       "      <td>15.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>{'tdiv': 207812.4669740607}</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>[207812.4669740607]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.056832</td>\n",
       "      <td>[-1.5941820301899867e-10]</td>\n",
       "      <td>0.056832</td>\n",
       "      <td>-6536.965831</td>\n",
       "      <td>Local minimum reached (|pg| ~= 0)</td>\n",
       "      <td>14.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>{'tdiv': 194127.098247352}</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>[194127.098247352]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.047111</td>\n",
       "      <td>[-3.688396697440477e-15]</td>\n",
       "      <td>0.047111</td>\n",
       "      <td>-6200.531366</td>\n",
       "      <td>Converged (|f_n-f_(n-1)| ~= 0)</td>\n",
       "      <td>13.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>{'tdiv': 195961.44600195438}</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>[195961.44600195438]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.051390</td>\n",
       "      <td>[-2.1960968638219668e-14]</td>\n",
       "      <td>0.051390</td>\n",
       "      <td>-6016.619226</td>\n",
       "      <td>Converged (|f_n-f_(n-1)| ~= 0)</td>\n",
       "      <td>12.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>{'tdiv': 210044.85760750948}</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>[210044.85760750948]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.050849</td>\n",
       "      <td>[-2.488357827122459e-12]</td>\n",
       "      <td>0.050849</td>\n",
       "      <td>-6285.086090</td>\n",
       "      <td>Local minimum reached (|pg| ~= 0)</td>\n",
       "      <td>12.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>{'tdiv': 206175.80195611055}</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>[206175.80195611055]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        fun                        jac  kl_divergence  log_likelihood  \\\n",
       "0  0.050932  [-4.9400055099086546e-14]       0.050932    -6217.879747   \n",
       "1  0.056832  [-1.5941820301899867e-10]       0.056832    -6536.965831   \n",
       "2  0.047111   [-3.688396697440477e-15]       0.047111    -6200.531366   \n",
       "3  0.051390  [-2.1960968638219668e-14]       0.051390    -6016.619226   \n",
       "4  0.050849   [-2.488357827122459e-12]       0.050849    -6285.086090   \n",
       "\n",
       "                             message  nfev  nit                    parameters  \\\n",
       "0     Converged (|f_n-f_(n-1)| ~= 0)  15.0  5.0   {'tdiv': 207812.4669740607}   \n",
       "1  Local minimum reached (|pg| ~= 0)  14.0  4.0    {'tdiv': 194127.098247352}   \n",
       "2     Converged (|f_n-f_(n-1)| ~= 0)  13.0  4.0  {'tdiv': 195961.44600195438}   \n",
       "3     Converged (|f_n-f_(n-1)| ~= 0)  12.0  4.0  {'tdiv': 210044.85760750948}   \n",
       "4  Local minimum reached (|pg| ~= 0)  12.0  4.0  {'tdiv': 206175.80195611055}   \n",
       "\n",
       "   status  success                     x  \n",
       "0     1.0      1.0   [207812.4669740607]  \n",
       "1     0.0      1.0    [194127.098247352]  \n",
       "2     1.0      1.0  [195961.44600195438]  \n",
       "3     1.0      1.0  [210044.85760750948]  \n",
       "4     0.0      1.0  [206175.80195611055]  "
      ]
     },
     "execution_count": 584,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res_dict[\"asymmetric_migration\"][\"no_migration\"]\n",
    "res_dict[\"asymmetric_migration\"][\"asymmetric_migration\"]\n",
    "#res_dict[\"symmetric_migration\"][\"no_migration\"]\n",
    "res_dict[\"no_migration\"][\"no_migration\"]\n",
    "#res_dict[\"no_migration\"][\"symmetric_migration\"][\"parameters\"][1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Perform model selection with AIC\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 567,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "no_migration no_migration\n",
      "no_migration asymmetric_migration\n",
      "no_migration symmetric_migration\n",
      "Delta AIC per model:  [0.57898907 3.98637385 0.        ]\n",
      "AIC weight per model:  [0.74864188 0.13626048 1.        ]\n",
      "\n",
      "asymmetric_migration no_migration\n",
      "asymmetric_migration asymmetric_migration\n",
      "asymmetric_migration symmetric_migration\n",
      "Delta AIC per model:  [65.31044955  0.         69.31044955]\n",
      "AIC weight per model:  [6.57682180e-15 1.00000000e+00 8.90076041e-16]\n",
      "\n",
      "symmetric_migration no_migration\n",
      "symmetric_migration asymmetric_migration\n",
      "symmetric_migration symmetric_migration\n",
      "Delta AIC per model:  [0.         6.88782933 4.00009967]\n",
      "AIC weight per model:  [1.         0.03193941 0.13532854]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for model in models:\n",
    "    AICs = []\n",
    "    for testmodel in models:\n",
    "        print(model, testmodel)\n",
    "        lik = res_dict[model][testmodel][\"log_likelihood\"]\n",
    "        nparams = len(res_dict[model][testmodel][\"parameters\"][0])\n",
    "        aics = 2*nparams - 2*lik\n",
    "        #print(aics, lik)\n",
    "        AICs.append(np.min(aics))\n",
    "    minv = np.min(AICs)\n",
    "    delta_aic = np.array(AICs) - minv\n",
    "    print(\"Delta AIC per model: \", delta_aic)\n",
    "    print(\"AIC weight per model: \", np.exp(-0.5 * delta_aic))\n",
    "    print(\"\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Crap below here"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 548,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "91"
      ]
     },
     "execution_count": 548,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sfs = sfs_dict[\"no_migration\"][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 592,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Avg pairwise heterozygosity [[0.23809524 0.        ]\n",
      " [0.28571429 0.5952381 ]\n",
      " [0.21428571 0.21428571]\n",
      " [0.         1.10714286]\n",
      " [0.         0.75      ]]\n",
      "populations ('pop1', 'pop2')\n",
      "percent missing data per population [0.26984127 0.192     ]\n"
     ]
    }
   ],
   "source": [
    "!python -m momi.extract_sfs tmp_sfs.gz 10 simout/tmp_allele_counts.gz\n",
    "sfs = momi.Sfs.load(\"tmp_sfs.gz\")\n",
    "print(\"Avg pairwise heterozygosity\", sfs.avg_pairwise_hets[:5])\n",
    "print(\"populations\", sfs.populations)\n",
    "print(\"percent missing data per population\", sfs.p_missing)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 190,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "simout/nomigration-sim2.vcf.gz\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/isaac/miniconda2/envs/momi-py36/lib/python3.6/site-packages/momi/compute_sfs.py:511: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n",
      "  [slice(None)]],\n",
      "/home/isaac/miniconda2/envs/momi-py36/lib/python3.6/site-packages/autograd/numpy/numpy_boxes.py:13: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n",
      "  def __getitem__(A, idx): return A[idx]\n",
      "/home/isaac/miniconda2/envs/momi-py36/lib/python3.6/site-packages/autograd/numpy/numpy_vjps.py:597: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n",
      "  onp.add.at(A, idx, x)\n",
      "/home/isaac/miniconda2/envs/momi-py36/lib/python3.6/site-packages/autograd/numpy/numpy_vjps.py:444: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n",
      "  return lambda g: g[idxs]\n",
      "/home/isaac/miniconda2/envs/momi-py36/lib/python3.6/site-packages/momi/compute_sfs.py:458: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n",
      "  [slice(None)] + [0] * len(b.pop_labels)]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "percent missing data per population [0.28494548 0.27241363] 2717.0 928 194095.84240640234\n",
      "percent missing data per population [0.13303507 0.12346818] 2625.0 926 194340.95056568255\n",
      "percent missing data per population [0.03833145 0.03034077] 2358.0 899 194479.44609429862\n",
      "percent missing data per population [0.02972925 0.02272324] 2071.0 867 196573.04078462542\n",
      "percent missing data per population [0.00491746 0.000693  ] 1717.0 796 192115.485457603\n"
     ]
    }
   ],
   "source": [
    "print(f)\n",
    "allele_counts = momi.SnpAlleleCounts.read_vcf(vcf_file=f, ind2pop=popdict, bed_file=f.split(\".\")[0]+\".bed\", ancestral_alleles=False)\n",
    "allele_counts.dump(f.split(\".\")[0]+\"_allele_counts.gz\")\n",
    "sfs = allele_counts.extract_sfs(n_blocks=None)\n",
    "res = optimize(sfs)\n",
    "print(\"percent missing data per population\", sfs.p_missing, sfs.n_snps(), sfs.n_loci, res.parameters.tdiv)\n",
    "subdict = {\"pop1\":6, \"pop2\":6}\n",
    "allele_counts = allele_counts.down_sample(subdict)\n",
    "sfs = allele_counts.extract_sfs(n_blocks=None)\n",
    "res = optimize(sfs)\n",
    "print(\"percent missing data per population\", sfs.p_missing, sfs.n_snps(), sfs.n_loci, res.parameters.tdiv)\n",
    "\n",
    "subdict = {\"pop1\":4, \"pop2\":4}\n",
    "allele_counts = allele_counts.down_sample(subdict)\n",
    "sfs = allele_counts.extract_sfs(n_blocks=None)\n",
    "res = optimize(sfs)\n",
    "print(\"percent missing data per population\", sfs.p_missing, sfs.n_snps(), sfs.n_loci, res.parameters.tdiv)\n",
    "\n",
    "subdict = {\"pop1\":3, \"pop2\":3}\n",
    "allele_counts = allele_counts.down_sample(subdict)\n",
    "sfs = allele_counts.extract_sfs(n_blocks=None)\n",
    "res = optimize(sfs)\n",
    "print(\"percent missing data per population\", sfs.p_missing, sfs.n_snps(), sfs.n_loci, res.parameters.tdiv)\n",
    "\n",
    "subdict = {\"pop1\":2, \"pop2\":2}\n",
    "allele_counts = allele_counts.down_sample(subdict)\n",
    "sfs = allele_counts.extract_sfs(n_blocks=None)\n",
    "res = optimize(sfs)\n",
    "print(\"percent missing data per population\", sfs.p_missing, sfs.n_snps(), sfs.n_loci, res.parameters.tdiv)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 187,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([2, 2])"
      ]
     },
     "execution_count": 187,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "allele_counts\n",
    "sfs.sampled_n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 198,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[[3, 1],\n",
       "        [6, 0]],\n",
       "\n",
       "       [[2, 2],\n",
       "        [6, 0]],\n",
       "\n",
       "       [[2, 2],\n",
       "        [4, 2]],\n",
       "\n",
       "       [[4, 0],\n",
       "        [4, 4]],\n",
       "\n",
       "       [[3, 1],\n",
       "        [1, 7]],\n",
       "\n",
       "       [[4, 2],\n",
       "        [0, 2]],\n",
       "\n",
       "       [[6, 0],\n",
       "        [3, 1]],\n",
       "\n",
       "       [[6, 0],\n",
       "        [3, 5]],\n",
       "\n",
       "       [[5, 1],\n",
       "        [8, 0]],\n",
       "\n",
       "       [[5, 1],\n",
       "        [0, 8]],\n",
       "\n",
       "       [[8, 0],\n",
       "        [1, 3]],\n",
       "\n",
       "       [[8, 0],\n",
       "        [7, 1]],\n",
       "\n",
       "       [[8, 0],\n",
       "        [5, 3]],\n",
       "\n",
       "       [[8, 0],\n",
       "        [0, 8]]])"
      ]
     },
     "execution_count": 198,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sfs = momi.Sfs.load(\"simout/tmp.sfs\")\n",
    "sfs.config_array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 247,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "skewed\n",
      "0D = 15\n",
      "1D = 1.3827545055689163 / 3.9858656094059413\n",
      "2D = 2.176857585139319\n",
      "even\n",
      "0D = 15\n",
      "1D = 2.70805020110221 / 15.0\n",
      "2D = 15.0\n",
      "highly skewed\n",
      "0D = 9\n",
      "1D = 1.6111578173439172 / 5.008606923972825\n",
      "2D = 2.999999999999999\n"
     ]
    }
   ],
   "source": [
    "from scipy.stats import entropy\n",
    "skewed_abunds = np.array([100, 10, 10, 8, 6, 4, 3, 2, 1, 1, 1, 1, 1, 1, 1])\n",
    "even_abunds = np.array([10] * 15)\n",
    "\n",
    "\n",
    "def diversity_indices(abunds):\n",
    "    print(\"0D = {}\".format(len(abunds)))\n",
    "    print(\"1D = {} / {}\".format(entropy(abunds), np.exp(entropy(abunds))))\n",
    "    proportions = abunds/np.sum(abunds)\n",
    "    squared_proportions = proportions * proportions\n",
    "    print(\"2D = {}\".format(1/np.sum(squared_proportions)))\n",
    "\n",
    "print(\"skewed\")\n",
    "diversity_indices(skewed_abunds)\n",
    "print(\"even\")\n",
    "diversity_indices(even_abunds)\n",
    "print(\"highly skewed\")\n",
    "diversity_indices([10, 1, 1, 1, 1, 1, 1, 1, 1])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 393,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "0",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[0;32m~/miniconda2/envs/momi-py36/lib/python3.6/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m   3077\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3078\u001b[0;31m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   3079\u001b[0m             \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mKeyError\u001b[0m: 0",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-393-0015cb4a933d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m#print(res_dict[\"no_migration\"][\"no_migration\"])\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mres_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"no_migration\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"no_migration\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mres_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"no_migration\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"no_migration\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mres_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"no_migration\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"no_migration\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mres_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"no_migration\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"no_migration\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/miniconda2/envs/momi-py36/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   2686\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_multilevel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2687\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2688\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2689\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2690\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_getitem_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/miniconda2/envs/momi-py36/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_getitem_column\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   2693\u001b[0m         \u001b[0;31m# get column\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2694\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_unique\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2695\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_item_cache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2696\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2697\u001b[0m         \u001b[0;31m# duplicate columns & possible reduce dimensionality\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/miniconda2/envs/momi-py36/lib/python3.6/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m_get_item_cache\u001b[0;34m(self, item)\u001b[0m\n\u001b[1;32m   2487\u001b[0m         \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2488\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mres\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2489\u001b[0;31m             \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2490\u001b[0m             \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_box_item_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2491\u001b[0m             \u001b[0mcache\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/miniconda2/envs/momi-py36/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(self, item, fastpath)\u001b[0m\n\u001b[1;32m   4113\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4114\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4115\u001b[0;31m                 \u001b[0mloc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   4116\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4117\u001b[0m                 \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/miniconda2/envs/momi-py36/lib/python3.6/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m   3078\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3079\u001b[0m             \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3080\u001b[0;31m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_maybe_cast_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   3081\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3082\u001b[0m         \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtolerance\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtolerance\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mKeyError\u001b[0m: 0"
     ]
    }
   ],
   "source": [
    "#print(res_dict[\"no_migration\"][\"no_migration\"])\n",
    "print(dict(res_dict[\"no_migration\"][\"no_migration\"][0]))\n",
    "print(pd.DataFrame.from_dict(res_dict[\"no_migration\"][\"no_migration\"][0]))\n",
    "df = pd.DataFrame.from_dict(res_dict[\"no_migration\"][\"no_migration\"])\n",
    "pd.concat([df, pd.Series(res_dict[\"no_migration\"][\"no_migration\"][0])])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 401,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>fun</th>\n",
       "      <th>jac</th>\n",
       "      <th>kl_divergence</th>\n",
       "      <th>log_likelihood</th>\n",
       "      <th>message</th>\n",
       "      <th>nfev</th>\n",
       "      <th>nit</th>\n",
       "      <th>parameters</th>\n",
       "      <th>status</th>\n",
       "      <th>success</th>\n",
       "      <th>x</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.347906</td>\n",
       "      <td>[-1.3436429009052591e-11]</td>\n",
       "      <td>0.347906</td>\n",
       "      <td>-728.09226</td>\n",
       "      <td>Local minimum reached (|pg| ~= 0)</td>\n",
       "      <td>10.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>{'tdiv': 171637.84925481293}</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>[171637.84925481293]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        fun                        jac  kl_divergence  log_likelihood  \\\n",
       "0  0.347906  [-1.3436429009052591e-11]       0.347906      -728.09226   \n",
       "\n",
       "                             message  nfev  nit                    parameters  \\\n",
       "0  Local minimum reached (|pg| ~= 0)  10.0  3.0  {'tdiv': 171637.84925481293}   \n",
       "\n",
       "   status  success                     x  \n",
       "0     0.0      1.0  [171637.84925481293]  "
      ]
     },
     "execution_count": 401,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res_dict[\"no_migration\"][\"no_migration\"].append(pd.Series(res), ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!bgzip -c 100loci.vcf > 100loci.vcf.gz\n",
    "!tabix 100loci.vcf.gz\n",
    "!python vcf2bed.py 100loci.vcf\n",
    "## Has to be run in python2\n",
    "#!../easySFS.py -i 100loci.vcf -p simout/simpops.txt --proj 8,8 -o 100_folded\n",
    "#!../easySFS.py -i 100loci.vcf -p simout/simpops.txt --proj 8,8 --unfolded -o 100_unfolded\n",
    "folded_allele_counts = momi.SnpAlleleCounts.read_vcf(vcf_file=\"1000loci.vcf.gz\", ind2pop=popdict, bed_file=\"1000loci.bed\", ancestral_alleles=False)\n",
    "unfolded_allele_counts = momi.SnpAlleleCounts.read_vcf(vcf_file=\"1000loci.vcf.gz\", ind2pop=popdict, bed_file=\"1000loci.bed\", ancestral_alleles=True)\n",
    "momi_folded_sfs = folded_allele_counts.extract_sfs(n_blocks=1)\n",
    "momi_unfolded_sfs = unfolded_allele_counts.extract_sfs(n_blocks=1)\n",
    "momi_folded_sfs = fold(momi_unfolded_sfs)\n",
    "\n",
    "my_folded_sfs = momi.Sfs.load(\"./1000_folded/dadi/pop1-pop2_momi.sfs\")\n",
    "my_unfolded_sfs = momi.Sfs.load(\"./1000_unfolded/dadi/pop1-pop2_momi.sfs\")\n",
    "momi_dict = momi_folded_sfs.to_dict()\n",
    "my_dict = my_folded_sfs.to_dict()\n",
    "\n",
    "print(momi_folded_sfs == my_folded_sfs)\n",
    "print(momi_unfolded_sfs == my_unfolded_sfs)\n",
    "print(momi_folded_sfs.n_snps(), my_folded_sfs.n_snps())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "((8, 0), (7, 1)) 185.0 185.0\n",
      "((8, 0), (6, 2)) 66.0 66.0\n",
      "((8, 0), (5, 3)) 48.0 48.0\n",
      "((8, 0), (4, 4)) 32.0 32.0\n",
      "((8, 0), (3, 5)) 28.0 28.0\n",
      "((8, 0), (2, 6)) 23.0 23.0\n",
      "((8, 0), (1, 7)) 18.0 18.0\n",
      "((8, 0), (0, 8)) 73.0 37.0\n",
      "((7, 1), (8, 0)) 185.0 185.0\n",
      "((7, 1), (7, 1)) 2.0 2.0\n",
      "((7, 1), (5, 3)) 2.0 2.0\n",
      "((7, 1), (4, 4)) 3.0 3.0\n",
      "((7, 1), (3, 5)) 4.0 4.0\n",
      "((7, 1), (2, 6)) 1.0 1.0\n",
      "((7, 1), (0, 8)) 17.0 17.0\n",
      "((6, 2), (8, 0)) 87.0 87.0\n",
      "((6, 2), (7, 1)) 1.0 1.0\n",
      "((6, 2), (6, 2)) 1.0 1.0\n",
      "((6, 2), (4, 4)) 2.0 2.0\n",
      "((6, 2), (3, 5)) 2.0 2.0\n",
      "((6, 2), (2, 6)) 1.0 1.0\n",
      "((6, 2), (1, 7)) 4.0 4.0\n",
      "((6, 2), (0, 8)) 23.0 23.0\n",
      "((5, 3), (8, 0)) 50.0 50.0\n",
      "((5, 3), (5, 3)) 1.0 1.0\n",
      "((5, 3), (4, 4)) 2.0 2.0\n",
      "((5, 3), (3, 5)) 1.0 1.0\n",
      "((5, 3), (2, 6)) 2.0 2.0\n",
      "((5, 3), (1, 7)) 1.0 1.0\n",
      "((5, 3), (0, 8)) 38.0 38.0\n",
      "((4, 4), (8, 0)) 32.0 32.0\n",
      "((4, 4), (7, 1)) 2.0 2.0\n",
      "((4, 4), (6, 2)) 1.0 1.0\n",
      "\n",
      "((7, 1), (5, 3)) 2.0 2.0\n",
      "((5, 3), (8, 0)) 50.0 50.0\n",
      "((8, 0), (0, 8)) 37.0 73.0\n",
      "((8, 0), (1, 7)) 18.0 18.0\n",
      "((6, 2), (2, 6)) 1.0 1.0\n",
      "((7, 1), (2, 6)) 1.0 1.0\n",
      "((5, 3), (1, 7)) 1.0 1.0\n",
      "((6, 2), (3, 5)) 2.0 2.0\n",
      "((8, 0), (5, 3)) 48.0 48.0\n",
      "((4, 4), (8, 0)) 32.0 32.0\n",
      "((5, 3), (5, 3)) 1.0 1.0\n",
      "((8, 0), (2, 6)) 23.0 23.0\n",
      "((7, 1), (3, 5)) 4.0 4.0\n",
      "((5, 3), (0, 8)) 38.0 38.0\n",
      "((6, 2), (6, 2)) 1.0 1.0\n",
      "((5, 3), (3, 5)) 1.0 1.0\n",
      "((4, 4), (7, 1)) 2.0 2.0\n",
      "((7, 1), (7, 1)) 2.0 2.0\n",
      "((8, 0), (4, 4)) 32.0 32.0\n",
      "((6, 2), (0, 8)) 23.0 23.0\n",
      "((7, 1), (4, 4)) 3.0 3.0\n",
      "((8, 0), (3, 5)) 28.0 28.0\n",
      "((6, 2), (7, 1)) 1.0 1.0\n",
      "((4, 4), (6, 2)) 1.0 1.0\n",
      "((7, 1), (8, 0)) 185.0 185.0\n",
      "((8, 0), (7, 1)) 185.0 185.0\n",
      "((7, 1), (0, 8)) 17.0 17.0\n",
      "((5, 3), (2, 6)) 2.0 2.0\n",
      "((6, 2), (1, 7)) 4.0 4.0\n",
      "((8, 0), (6, 2)) 66.0 66.0\n",
      "((6, 2), (8, 0)) 87.0 87.0\n",
      "((5, 3), (4, 4)) 2.0 2.0\n",
      "((6, 2), (4, 4)) 2.0 2.0\n",
      "938.0\n",
      "902.0\n"
     ]
    }
   ],
   "source": [
    "for k,v in momi_dict.items():\n",
    "    try:\n",
    "        print(k, momi_dict[k], my_dict[k])\n",
    "    except:\n",
    "        print(\"Not in dadi {} - {}\".format(k, momi_dict[k]))\n",
    "print()\n",
    "for k,v in my_dict.items():\n",
    "    try:\n",
    "        print(k, my_dict[k], momi_dict[k])\n",
    "    except:\n",
    "        print(\"Not in momi {} - {}\".format(k, my_dict[k]))\n",
    "print(np.sum([x for x in momi_dict.values()]))\n",
    "print(np.sum([x for x in my_dict.values()]))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 664,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['[[0, 8], [3, 5]]', '[[0, 8], [8, 0]]', '[[1, 7], [8, 0]]', '[[2, 6], [8, 0]]', '[[3, 5], [8, 0]]', '[[4, 4], [8, 0]]', '[[5, 3], [8, 0]]', '[[6, 2], [0, 8]]', '[[6, 2], [2, 6]]', '[[6, 2], [3, 5]]', '[[6, 2], [8, 0]]', '[[7, 1], [3, 5]]', '[[7, 1], [8, 0]]', '[[8, 0], [0, 8]]', '[[8, 0], [1, 7]]', '[[8, 0], [2, 6]]', '[[8, 0], [3, 5]]', '[[8, 0], [4, 4]]', '[[8, 0], [5, 3]]', '[[8, 0], [6, 2]]', '[[8, 0], [7, 1]]']\n",
      "['[[0, 8], [3, 5]]', '[[0, 8], [8, 0]]', '[[1, 7], [8, 0]]', '[[2, 6], [8, 0]]', '[[3, 5], [8, 0]]', '[[4, 4], [8, 0]]', '[[5, 3], [8, 0]]', '[[6, 2], [0, 8]]', '[[6, 2], [2, 6]]', '[[6, 2], [3, 5]]', '[[6, 2], [8, 0]]', '[[7, 1], [3, 5]]', '[[7, 1], [8, 0]]', '[[8, 0], [0, 8]]', '[[8, 0], [1, 7]]', '[[8, 0], [2, 6]]', '[[8, 0], [3, 5]]', '[[8, 0], [4, 4]]', '[[8, 0], [5, 3]]', '[[8, 0], [6, 2]]', '[[8, 0], [7, 1]]']\n",
      "\n",
      "set()\n",
      "set()\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 664,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "momi_sfs = momi.Sfs.load(\"unfolded/dadi/momi_unfolded.sfs\")\n",
    "my_sfs = momi.Sfs.load(\"unfolded/dadi/pop1-pop2_momi.sfs\")\n",
    "x = [str(x.tolist()) for x in list(momi_sfs.configs)]\n",
    "y = [str(x.tolist()) for x in list(my_sfs.configs)]\n",
    "print(sorted(x))\n",
    "print(sorted(y))\n",
    "print()\n",
    "print(set(x).difference(set(y)))\n",
    "print(set(y).difference(set(x)))\n",
    "#set(list(momi_sfs.configs)).difference(set(list(my_sfs.configs)))\n",
    "my_sfs.dump(\"/tmp/mysfs.sfs\")\n",
    "momi_dict = momi_sfs.to_dict()\n",
    "my_dict = my_sfs.to_dict()\n",
    "my_sfs == momi_sfs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 696,
   "metadata": {},
   "outputs": [],
   "source": [
    "#print(popdict)\n",
    "allele_counts = momi.SnpAlleleCounts.read_vcf(vcf_file=\"25loci.vcf.gz\", ind2pop=popdict, bed_file=\"25loci.bed\", ancestral_alleles=True)\n",
    "unfolded_sfs = allele_counts.extract_sfs(n_blocks=1)\n",
    "unfolded_sfs.dump(\"momi_unfolded.sfs\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 701,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "both less than [0 1] [8 7]\n",
      "folding from [0 8] [1 7]\n",
      "folding to [8 0] [7 1]\n",
      "both less than [1 8] [7 0]\n",
      "folding from [1 7] [8 0]\n",
      "folding to [7 1] [0 8]\n",
      "both less than [3 8] [5 0]\n",
      "folding from [3 5] [8 0]\n",
      "folding to [5 3] [0 8]\n"
     ]
    }
   ],
   "source": [
    "def get_folded(config):\n",
    "    if tuple(config[:, 0]) < tuple(config[:, 1]):\n",
    "        print(\"both less than {} {}\".format(config[:, 0], config[:, 1]))\n",
    "        print(\"folding from {} {}\".format(config[0], config[1]))\n",
    "        print(\"folding to {} {}\".format(config[:, ::-1][0], config[:, ::-1][1]))\n",
    "        return config[:, ::-1]\n",
    "    else:\n",
    "        return config\n",
    "for config in unfolded_sfs.configs:\n",
    "    folded = get_folded(config)\n",
    "    #print(\"{} {}\\t{} {}\".format(config[0], config[1], folded[0], folded[1]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Investigating differences between momi and dadi folding strategies\n",
    "# Before"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "False\n",
      "True\n"
     ]
    }
   ],
   "source": [
    "!bgzip -c 25loci.vcf > 25loci.vcf.gz\n",
    "!tabix 25loci.vcf.gz\n",
    "!python vcf2bed.py 25loci.vcf\n",
    "\n",
    "folded_allele_counts = momi.SnpAlleleCounts.read_vcf(vcf_file=\"25loci.vcf.gz\", ind2pop=popdict, bed_file=\"25loci.bed\", ancestral_alleles=False)\n",
    "unfolded_allele_counts = momi.SnpAlleleCounts.read_vcf(vcf_file=\"25loci.vcf.gz\", ind2pop=popdict, bed_file=\"25loci.bed\", ancestral_alleles=True)\n",
    "momi_folded_sfs = folded_allele_counts.extract_sfs(n_blocks=1)\n",
    "momi_unfolded_sfs = unfolded_allele_counts.extract_sfs(n_blocks=1)\n",
    "\n",
    "my_folded_sfs = momi.Sfs.load(\"./25_folded/dadi/pop1-pop2_momi.sfs\")\n",
    "my_unfolded_sfs = momi.Sfs.load(\"./25_unfolded/dadi/pop1-pop2_momi.sfs\")\n",
    "\n",
    "print(momi_folded_sfs == my_folded_sfs)\n",
    "print(momi_unfolded_sfs == my_unfolded_sfs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# After"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "watdo\n",
      "Folding [[0 8]\n",
      " [1 7]] to [[8 0]\n",
      " [7 1]]\n",
      "False\n",
      "True\n"
     ]
    }
   ],
   "source": [
    "!bgzip -c 25loci.vcf > 25loci.vcf.gz\n",
    "!tabix 25loci.vcf.gz\n",
    "!python vcf2bed.py 25loci.vcf\n",
    "\n",
    "folded_allele_counts = momi.SnpAlleleCounts.read_vcf(vcf_file=\"25loci.vcf.gz\", ind2pop=popdict, bed_file=\"25loci.bed\", ancestral_alleles=False)\n",
    "unfolded_allele_counts = momi.SnpAlleleCounts.read_vcf(vcf_file=\"25loci.vcf.gz\", ind2pop=popdict, bed_file=\"25loci.bed\", ancestral_alleles=True)\n",
    "momi_folded_sfs = folded_allele_counts.extract_sfs(n_blocks=1)\n",
    "momi_unfolded_sfs = unfolded_allele_counts.extract_sfs(n_blocks=1)\n",
    "\n",
    "my_folded_sfs = momi.Sfs.load(\"./25_folded/dadi/pop1-pop2_momi.sfs\")\n",
    "my_unfolded_sfs = momi.Sfs.load(\"./25_unfolded/dadi/pop1-pop2_momi.sfs\")\n",
    "\n",
    "print(momi_folded_sfs == my_folded_sfs)\n",
    "print(momi_unfolded_sfs == my_unfolded_sfs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['[[1, 7], [8, 0]]', '[[3, 5], [8, 0]]', '[[4, 4], [6, 2]]', '[[4, 4], [8, 0]]', '[[5, 3], [8, 0]]', '[[6, 2], [0, 8]]', '[[6, 2], [8, 0]]', '[[7, 1], [8, 0]]', '[[8, 0], [1, 7]]', '[[8, 0], [6, 2]]', '[[8, 0], [7, 1]]']\n",
      "['[[4, 4], [6, 2]]', '[[4, 4], [8, 0]]', '[[5, 3], [0, 8]]', '[[5, 3], [8, 0]]', '[[6, 2], [0, 8]]', '[[6, 2], [8, 0]]', '[[7, 1], [0, 8]]', '[[7, 1], [8, 0]]', '[[8, 0], [1, 7]]', '[[8, 0], [6, 2]]', '[[8, 0], [7, 1]]']\n",
      "\n",
      "{'[[3, 5], [8, 0]]', '[[1, 7], [8, 0]]'}\n",
      "{'[[7, 1], [0, 8]]', '[[5, 3], [0, 8]]'}\n"
     ]
    }
   ],
   "source": [
    "x = [str(x.tolist()) for x in list(momi_folded_sfs.configs)]\n",
    "y = [str(x.tolist()) for x in list(my_folded_sfs.configs)]\n",
    "print(sorted(x))\n",
    "print(sorted(y))\n",
    "print()\n",
    "print(set(x).difference(set(y)))\n",
    "print(set(y).difference(set(x)))\n",
    "momi_folded_dict = momi_folded_sfs.to_dict()\n",
    "my_folded_dict = my_folded_sfs.to_dict()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "((8, 0), (7, 1)) 2.0 2.0\n",
      "((8, 0), (6, 2)) 5.0 5.0\n",
      "((8, 0), (1, 7)) 1.0 1.0\n",
      "((7, 1), (8, 0)) 8.0 8.0\n",
      "Not in my ((1, 7), (8, 0)) - 1.0\n",
      "((6, 2), (8, 0)) 1.0 1.0\n",
      "((6, 2), (0, 8)) 1.0 1.0\n",
      "((5, 3), (8, 0)) 2.0 2.0\n",
      "Not in my ((3, 5), (8, 0)) - 1.0\n",
      "((4, 4), (8, 0)) 1.0 1.0\n",
      "((4, 4), (6, 2)) 1.0 1.0\n",
      "\n",
      "((8, 0), (7, 1)) 2.0 2.0\n",
      "Not in momi ((7, 1), (0, 8)) - 1.0\n",
      "((5, 3), (8, 0)) 2.0 2.0\n",
      "((6, 2), (0, 8)) 1.0 1.0\n",
      "((8, 0), (1, 7)) 1.0 1.0\n",
      "((6, 2), (8, 0)) 1.0 1.0\n",
      "Not in momi ((5, 3), (0, 8)) - 1.0\n",
      "((4, 4), (6, 2)) 1.0 1.0\n",
      "((7, 1), (8, 0)) 8.0 8.0\n",
      "((8, 0), (6, 2)) 5.0 5.0\n",
      "((4, 4), (8, 0)) 1.0 1.0\n"
     ]
    }
   ],
   "source": [
    "for k,v in momi_folded_dict.items():\n",
    "    try:\n",
    "        print(k, momi_folded_dict[k], my_folded_dict[k])\n",
    "    except:\n",
    "        print(\"Not in my {} - {}\".format(k, momi_folded_dict[k]))\n",
    "print()\n",
    "for k,v in my_folded_dict.items():\n",
    "    try:\n",
    "        print(k, my_folded_dict[k], momi_folded_dict[k])\n",
    "    except:\n",
    "        print(\"Not in momi {} - {}\".format(k, my_folded_dict[k]))\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tup1 = tuple([2, 8])\n",
    "tup2 = tuple([6, 0])\n",
    "print(tup1 < tup2)\n",
    "np.all(np.array(tup1) < np.array(tup2))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[3 0] [5 8]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[5, 3],\n",
       "       [8, 0]])"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def get_folded(config):\n",
    "    print(config[:, 0], config[:, 1])\n",
    "\n",
    "    if tuple(config[:, 0]) < tuple(config[:, 1]):\n",
    "#    if np.all(np.array(config[:, 0]) < np.array(config[:, 1])):\n",
    "#    if np.sum(config[:, 1]) > np.sum([8, 8])/2.:\n",
    "        #print(config[:, 0], config[:, 1])\n",
    "        return config[:, ::-1]\n",
    "    else:\n",
    "        return config\n",
    "#get_folded(np.array([[2, 6], [8, 0]]))\n",
    "#get_folded(np.array([[6, 2], [0, 8]]))\n",
    "#get_folded(np.array([[0, 8], [1, 7]]))\n",
    "get_folded(np.array([[3, 5], [0, 8]]))\n",
    "#get_folded(np.array([[3, 5], [3, 5]]))\n",
    "#get_folded(np.array([[8, 0], [0, 8]]))\n",
    "#get_folded(np.array([[0, 8], [8, 0]]))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'    def fold(self):\\n        \"\"\"\\n        :returns: A copy of the SFS, but with folded entries.\\n        :rtype: :class:`Sfs`\\n        \"\"\"\\n        print(\"watdo\")\\n        def get_folded(config):\\n            if np.all(np.array(config[:, 0]) < np.array(config[:, 1])):\\n                print(\"Folding {} to {}\".format(config, config[:, ::-1]))\\n                return config[:, ::-1]\\n            else:\\n                return config\\n\\n        compressed_folded = CompressedAlleleCounts.from_iter(\\n            map(get_folded, self.configs),\\n            len(self.sampled_pops), sort=False)\\n\\n        return self.from_matrix(\\n            compressed_folded.index2uniq_mat @ self.freqs_matrix,\\n            ConfigList(self.sampled_pops, compressed_folded.config_array,\\n                        sampled_n=self.sampled_n,\\n                        ascertainment_pop=self.ascertainment_pop),\\n            folded=True, length=self._length)\\n'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import inspect\n",
    "inspect.getsource(momi.Sfs.fold)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import momi\n",
    "#from momi.data import convert\n",
    "#momi.data.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "97.0"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import momi\n",
    "from momi.data import convert\n",
    "\n",
    "\n",
    "unfolded = \"/home/isaac/easySFS/jupyter-notebooks/100_unfolded/dadi/pop1-pop2.sfs\"\n",
    "folded = \"/home/isaac/easySFS/jupyter-notebooks/100_folded/dadi/pop1-pop2.sfs\"\n",
    "sfs = convert.sfs_from_dadi(unfolded)\n",
    "sfs.n_snps()\n",
    "sfs = convert.sfs_from_dadi(folded)\n",
    "sfs.n_snps()\n",
    "resamp = sfs.resample()\n",
    "resamp.n_snps()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['msp_0', 'msp_1', 'msp_2', 'msp_3', 'msp_4', 'msp_5', 'msp_6', 'msp_7']\n"
     ]
    }
   ],
   "source": [
    "import momi\n",
    "from momi.data import convert\n",
    "\n",
    "vcf = \"/home/isaac/easySFS/jupyter-notebooks/100loci.vcf\"\n",
    "\n",
    "dat = open(vcf).readlines()[:10]\n",
    "## Relying on the fact that the vcf files 5th line contains the samples\n",
    "## which are indexed as 9 and on\n",
    "samps = dat[5].split()[9:]\n",
    "print(samps)\n",
    "samps_per_pop = int(len(samps)/2)\n",
    "popdict = {x:\"pop1\" for x in samps[:samps_per_pop]}\n",
    "popdict.update({x:\"pop2\" for x in samps[samps_per_pop:]})\n",
    "\n",
    "unfolded = \"/home/isaac/easySFS/jupyter-notebooks/1000_unfolded/dadi/pop1-pop2.sfs\"\n",
    "folded = \"/home/isaac/easySFS/jupyter-notebooks/1000_folded/dadi/pop1-pop2.sfs\"\n",
    "my_folded = convert.sfs_from_dadi(folded)\n",
    "my_unfolded = convert.sfs_from_dadi(unfolded)\n",
    "\n",
    "folded_allele_counts = momi.SnpAlleleCounts.read_vcf(vcf_file=\"1000loci.vcf.gz\", ind2pop=popdict, bed_file=\"1000loci.bed\", ancestral_alleles=False)\n",
    "unfolded_allele_counts = momi.SnpAlleleCounts.read_vcf(vcf_file=\"1000loci.vcf.gz\", ind2pop=popdict, bed_file=\"1000loci.bed\", ancestral_alleles=True)\n",
    "momi_folded_sfs = folded_allele_counts.extract_sfs(n_blocks=1)\n",
    "momi_unfolded_sfs = unfolded_allele_counts.extract_sfs(n_blocks=1)\n",
    "\n",
    "print(my_folded == momi_folded_sfs)\n",
    "print(my_unfolded == momi_unfolded_sfs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "938.0\n",
      "938.0\n"
     ]
    }
   ],
   "source": [
    "print(my_folded.n_snps())\n",
    "print(my_unfolded.n_snps())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
back to top