We are hiring ! See our job offers.
Raw File
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Useful msprime docs:\n",
    "https://github.com/jeromekelleher/spg-chapter/blob/master/jupyter/msprime-chapter-examples.ipynb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 289,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "from __future__ import print_function\n",
    "import msprime\n",
    "import matplotlib.pyplot as plt\n",
    "import ipyrad.analysis as ipa\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import ipyparallel as ipp\n",
    "import datetime\n",
    "import time\n",
    "import sys\n",
    "import os\n",
    "from tempfile import TemporaryFile\n",
    "from IPython.display import SVG, display\n",
    "\n",
    "simout = \"simout\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 290,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [####################] 100% done  "
     ]
    }
   ],
   "source": [
    "def progressbar(njobs, finished, msg=\"\", spacer=\"  \"):\n",
    "    \"\"\" prints a progress bar \"\"\"\n",
    "    if njobs:\n",
    "        progress = 100*(finished / float(njobs))\n",
    "    else:\n",
    "        progress = 100\n",
    "\n",
    "    hashes = '#'*int(progress/5.)\n",
    "    nohash = ' '*int(20-len(hashes))\n",
    "\n",
    "    args = [spacer, hashes+nohash, int(progress), msg]\n",
    "    print(\"\\r{}[{}] {:>3}% {} \".format(*args), end=\"\")\n",
    "    sys.stdout.flush()\n",
    "\n",
    "for i in range(100):\n",
    "    progressbar(100, i, \"watdo\")\n",
    "progressbar(100, 100, \"done\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 282,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  INFO: # PCs < # samples. Forcing # PCs = 8\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x7f82212d6cd0>"
      ]
     },
     "execution_count": 282,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAagAAAFgCAYAAADuCe0ZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAHwRJREFUeJzt3Xt0VeWd//HPNwkeICi3xCKBmjOYCIgXICDWdgZMabW1aqvTVrBj26mX36oTFMW26dBOS83SUlHzm479eWlnykVr661Vi0Vqq04n2qBVQYQWE5VbTRgMNKVHEr6/P85JPMTgAcnJfpLzfq3FWsneJ3t/uOWTZ+/n7MfcXQAAhCYv6gAAAHSHggIABImCAgAEiYICAASJggIABImCAgAEiYICAASJggIABImCAgAEqSDqAIeiqKjIS0tLo44BoA9bs2ZNs7sXR50DmfWpgiotLVV9fX3UMQD0YWb2atQZcHC4xAcACBIFBQAIEgUFAAhSn7oHBQDZsGbNmqMLCgrukDRJ/ODem/ZJWtvW1valqVOnvtF1JwUFIOcVFBTcMWrUqAnFxcU78/LyWCSvl+zbt8+ampombt++/Q5J53Tdz08KACBNKi4u3kU59a68vDwvLi5uUXLk+s79vZwHAEKURzlFI/Xn3m0XUVAAgCBRUABwiDasf/mIy8+9MH7xqWeWX37uhfEN618+IupM/VFOF1RjQ4PmX3SJLpv1Sc2/6BI1NjREHQlA4Dasf/mIr3x0bvkpP982Yubv9x55ys+3jfjKR+eW91RJ/fjHPx5mZlOfe+65gZK0YcOGI8rKyk6QpIceeujIWbNmHdcT5zlc6bmyJWcLqrGhQQtmz1H58j9qxm93qXz5H7Vg9hxKCsC7uumr3yo58/UjYzHLlyTFLF9nvn5k7KavfqukJ45/9913j5gyZcpfli5dOqInjteX5WxB1S6sUeWmmNL/kVVuiql2YU3EyQCEbM/2nQM6vm90iFm+9vx554DDPXZLS0tefX39kB/96EeN999///BD/fqHH354yPjx4yeOHz9+4oQJEybu3Lkzr6WlJe+0004rnzhx4oTy8vKJy5YtGyYlR0DxePyEz3zmM8eWlZWdcM4558QfeOCBI6dMmTL+2GOPnfT4448PlqT58+ePPu+88+IzZswoP/bYYyfdeOONRV3P29bWpssuu2zMpEmTJpSXl09cvHhxkSS9+uqrAyoqKo4fP378xLKyshNWrlw55FB+Pzn7PqjWLc3q7h9Z69YdESUC0BcMGjV8b8K37ff9I+HtGvS+o/ce7rGXL18+bObMmS0nnXRSYtiwYe1PPfXU4OLi4raD/fobb7xxVG1t7asf+chHWltaWvIGDx68T5IefvjhP40YMWLftm3bCk499dTxc+bMeVOSXn/99YE/+clPXpk6deqrJ5100oTly5ePrK+vf3nFihXDrrvuumNmzZq1SZLWr18/aM2aNet3796dP3ny5Innn39+S/p5b7755qKhQ4e2r127dv2ePXts2rRp4z/xiU/suuuuu4ZXVla23HDDDdvb2tq0e/fuQxoU5ewIqrCkSAlv329bwttVOHpkRIkA9AVXXf/NLSvH7k50fP9IeLtWjt2duOr6b2453GPfc889Iy688MKdknT++ef/76Fe5psxY8ZfrrnmmrHf+c53jm5ubs4fMGCA9u3bZ1deeeWY8vLyibNmzSp/4403jti8eXOBJJWUlCSmT5++Jz8/X+Xl5XvOOOOMXXl5eZoyZcpfN2/eHOs47llnnfXmkCFD/Jhjjmk77bTTdj355JOF6ed97LHHjrrnnntGjh8/fuLkyZMn7Ny5s+Cll14aOGPGjNa77rqraP78+aOfeeaZQcOHD993KL+fnB1BVS2q1oK6OZ2X+RLertXjElq8qDrqaAACdvyE8W/d8OjyjTd99Vsle/68c8Cg9x2994brf7Dl+Anj3zqc427fvj2/rq7uqI0bNw664oor1N7ebmbmV1111TseAXQgNTU1288777yWBx98cOgHPvCBCStXrtz45JNPFu7YsaPgxRdfXB+LxbykpOTEPXv25EnSEUcc0fner7y8PA0cONAlKT8/X+3t7daxz8z2O0/Xz93dbrzxxtfOP//8XV0zPfHEExvuvffeoZ///OfjVVVVf77iiisO+jJVzhZUaTyuxatWqHZhjVq37lDh6JFavKhapfF41NEABO74CePf+sGDd/XojKqlS5cO/9SnPrVjxYoVnetVTZs27fjGxsaDnh24bt262PTp0/dMnz59z9NPP124du3agS0tLflFRUV7Y7GY/+IXvzhy69athzzb8Je//OWw6667btuuXbvy6urqjrzpppu2JBKJzpaaPXt2y6233lp89tln747FYv7CCy/ESktL927fvr0gHo+/dfXVVze3trbmPfvss4MlUVAHozQe15Jlt0cdAwD005/+dOS11167LX3bueeeu7OmpuaYgz3Gd7/73aN/97vfHZWXl+fl5eV7LrjggpY333wz/6yzzjpu0qRJE0444YS/xuPxvx1qtsmTJ7dWVlaWbd269YhrrrlmW2lp6d4NGzZ0Ft1VV13V3NjYGDvxxBMnuLuNGDFi7yOPPLLp0UcfPbK2tnZUQUGBDx48uH358uWHVOrm3nee7lFRUeGsqAvgcJjZGnevSN/2/PPPN5588snNUWUK2fz580cPGTKk/dvf/vafs3WO559/vujkk08u7bo9ZydJAADCltOX+ACgL7rllltG3nrrre9L3zZt2rS/LF269LWePteSJUu29vQxDxYFBQDSvn379llfeaL5vHnzdsybN69fvGlz3759puTChe/AJT4AkNY2NTUNTX2zRC9JLVg4VNLa7vYzggKQ89ra2r60ffv2O7Zv386S772rc8n37nZSUABy3tSpU99QN0uOI1r8pAAACBIFBQAIEgUFAAhSv7wH1djQkHzG3pZmFZYUqYpn7AFAn9PvCqpjpdy3n1K+Uwvq5mjxqhWUFAD0If3uEh8r5QJA/9DvCoqVcgGgf+h3BcVKuQDQP/S7gqpaVK3V4xJKX4559biEqlgpFwD6lH5XUB0r5W6cW6a6mUO1cW4ZEyQAoA/qd7P4JFbKBYD+oN+NoAAA/QMFBQAIEgUFAAgSBQUACBIFBQAIEgUFAAgSBQUACBIFBQAIEgUFAAgSBQUACFLkBWVm+Wb2nJk9FHUWAEA4Ii8oSfMkrY86BAAgLJEWlJmNkfRxSXdEmQMAEJ6oR1A3S7pW0r6IcwAAAhNZQZnZ2ZLecPc1GV53qZnVm1l9U1NTL6UDAEQtyhHU6ZLOMbNGSXdLOsPMlnV9kbvf5u4V7l5RXFzc2xkBABGJrKDc/WvuPsbdSyV9VtKv3f2iqPIAAMIS9T0oAAC6FcSS7+7+G0m/iTgGACAgjKAAAEGioAAAQaKgAABBoqAAAEGioAAAQaKgAABBoqAAAEGioAAAQaKgAABBoqAAAEGioAAAQaKgAABBoqAAAEGioAAAQQpiuQ0AOFSNDQ2qXVij1i3NKiwpUtWiapXG41HHQg+ioAD0OY0NDVowe44qN8UUs3wlfKcW1M3R4lUrKKl+hEt8APqc2oU1neUkSTHLV+WmmGoX1kScDD2JggLQ57Ruae4spw4xy1fr1h0RJUI2UFAA+pzCkiIlvH2/bQlvV+HokRElQjZQUAD6nKpF1Vo9LtFZUglv1+pxCVUtqo44GXoSBQWgzymNx7V41QptnFumuplDtXFuGRMk+iFm8QHok0rjcS1ZdnvUMZBFjKAAAEGioAAAQaKgAABBoqAAAEGioAAAQaKgAABBoqAAAEGioAAAQaKgAABBoqAAAEGioAAAQaKgAABBoqAAAEGioAAAQaKgAABBoqAAAEGioAAAQaKgAABBoqAAAEGioAAAQaKgAABBoqAAAEGioAAAQYqsoMxsrJk9bmbrzWydmc2LKgsAIDwFEZ67TdLV7v6smR0paY2ZrXL3lyLMBAAIRGQjKHff5u7Ppj7eLWm9pJKo8gAAwhLEPSgzK5U0WdLT0SYBAIQi8oIysyGS7pV0pbvv6mb/pWZWb2b1TU1NvR8QABCJSAvKzAYoWU7L3f2+7l7j7re5e4W7VxQXF/duQABAZKKcxWeS7pS03t2XRJUDABCmKEdQp0v6nKQzzOwPqV8fizAPACAgkU0zd/enJFlU5wcAhC3ySRIAAHSHggIABImCAgAEiYICAASJggIABImCAgAEiYICAASJggIABImCAgAEiYICAASJggIABImCAgAEiYICAATpkArKzArNLD9bYQAA6PCuBWVmeWY2x8weNrM3JL0saZuZrTOzxWZW1jsxAQC5JtMI6nFJ4yR9TdIodx/r7kdL+pCkOknXm9lFWc4IAMhBmRYs/LC77+260d3/V9K9ku41swFZSQYAyGnvWlBdy8nMBkq6SNIgSSvcfUd3BQYAwOE61Fl8t0jKl/Q3SQ/0fBwAAJIyTZJYYWbj0jaNkLRc0l2ShmczGAAgt2W6B/Wvkr5jZlslLZL0PUk/lzRQ0r9lNxoAIJdlugf1iqQ5ZvZBST+R9LCk2e7e3hvhAAC5K9MlvuFm9mVJEyV9WlKLpEfN7OzeCAcAyF2ZJkk8ICmh5CW9pe7+Y0mfkDTVzH6e7XAAgNyV6R7USEkrlJxW/k+S5O57JH3LzI7JcjYAQA7LVFDfkLRKUrukr6bvcPdt2QoFAECmSRL3Sbqvl7IAANDpXQvKzPIkXSzpfEljJbVJ+qOkH7j7b7KeDgCQszJd4rtT0quSrpd0gaRdkp6U9K9mdqK7/98s5wMA5KhMBTXV3b+Q+vgpM6tz92+Y2ROS/iCJggIAZEWmaeZ7Ox51ZGZTJL0lSe6ekORZzgYAyGGZRlALJD1uZn+TNEDSZyXJzIolPZTlbACAHJZpFt+vzexYSSPdvTlte5Oka7MdDgCQuzIut+FJzV23m9mo7EQCAODQ14NKd2ePpQAAoIv3XFDu/vGeDAIAQLpMkyQkdU6KGKPkG3Ub3P0vWU0FAMh5mZ4kMVFSraRSSe+X9Jyko83st5LmuXtL1hMCAHJSpkt8P5T0ZXc/TtIHJb3s7nFJ/y3uQQEAsihTQQ1y9w2S5O7PSDox9fHtSi5iCABAVmS6B7XJzBZKWi3pU0o+3khmNuAgvhYAgPcs0wjqi5KOlFSt5Mq681LbByu1gCEAANmQ6UkSb6qbJ0akJkfUZSsUAADv+X1QZnZbTwYBACBdpmnmIw60S9LHej4OAABJmSY6NCm5YKGlbfPU50dnKxQAAJkK6hVJle7+WtcdZvb64Z7czM6UdIukfEl3uPv1h3tMAED/kOke1M2Shh9g33cP58Rmli/p+5LOUvI9VRemnlwBAEDGWXzff5d9h7vc+3RJf3L3VyTJzO6WdK6klw7zuACAfuBdR1Bm9sEM+48ys0nv8dwlktIvE25Obet6jkvNrN7M6puamt7jqQAAfU2me1Dnm9l3Ja2UtEbJSRMDJR0naZakYyVd/R7Pbd1s83dscL9N0m2SVFFR8Y79AID+KdMlvqvMbLikCyT9o6RjJO2RtF7S/3P3pw7j3JsljU37fIykrYdxPABAP5LxeXruvlPS7alfPen3ksrMLC5pi6TPSprTw+cAAPRRkT3w1d3bzOwKSY8qOc38h+6+Lqo8AICwRPpEcnd/RNIjUWYAAITpPT+LDwCAbMpYUKmp5OO62X5SdiIBAJD5fVCflvSypHvNbJ2ZTUvb/Z/ZDAYAyG2ZRlDVkqa6+ymSviBpqZl9KrWvu/cxAQDQIzJNksh3922S5O7PmNksSQ+Z2Rh186ZaAAB6SqYR1O70+0+pspqp5DPzTshiLgBAjss0gvo/6nIpz913p5bJ+HTWUgEAcl6mEVSrpPd1s32GpLqejwMAQNLBrAe1u5vte1L7AADIikwFVeruL3Td6O71kkqzkggAAGUuqIHvsm9QTwYBACBdpoL6vZld0nWjmf2zkutDAQCQFZlm8V0p6X4zm6u3C6lC0hGSPpnNYACA3JZpwcI/S/pA6g26HUu7P+zuv856MgBATnvXgjKzgZIuV3KJ9xcl3enubb0RDACQ2zLdg/ovJS/pvSjpLEnfy3oiAACU+R7URHc/UZLM7E5Jz2Q/EgAAmUdQezs+4NIeAKA3ZRpBnWxmu1Ifm6RBqc9Nkrv7UVlNBwDIWZlm8eX3VhCEqbGhQbULa9S6pVmFJUWqWlSt0ng86lgAckCmERRyWGNDgxbMnqPKTTHFLF8J36kFdXO0eNUKSgpA1mW6B4UcVruwprOcJClm+arcFFPtwpqIkwHIBRQUDqh1S3NnOXWIWb5at+6IKBGAXEJB4YAKS4qU8Pb9tiW8XYWjR0aUCEAuoaBwQFWLqrV6XKKzpBLertXjEqpaVB1xMgC5gILCAZXG41q8aoU2zi1T3cyh2ji3rE9MkGhsaND8iy7RZbM+qfkXXaLGhoaoIwF4D8zdo85w0CoqKry+vj7qGAjYO2ceJkd9faFY0TvMbI27V0SdA5kxgkK/wsxDoP+goNCvMPMQ6D8oKPQrzDwE+g8KCv0KMw+B/oOCQr/SV2ceAngnnsWHfqc0HteSZbdHHQPAYWIEBQAIEgUFAAgSBQUACBIFBQAIEgUFAAgSBQUACBIFBQAIEgUFAAgSBQUACBIFBQAIEgUFAAgSBQUACFIkBWVmi83sZTN7wczuN7NhUeQAAIQrqhHUKkmT3P0kSRslfS2iHACAQEVSUO7+K3dvS31aJ2lMFDkAAOEK4R7UFyX98kA7zexSM6s3s/qmpqZejAUAiFLWFiw0s8ckjepm19fd/cHUa74uqU3S8gMdx91vk3SbJFVUVHgWogIAApS1gnL3D7/bfjO7WNLZkirdneIBAOwnkiXfzexMSV+R9A/u/tcoMiA6jQ0Nql1Yo9YtzSosKVLVomqVxuNRxwIQmEgKStK/S4pJWmVmklTn7pdHlAW9qLGhQQtmz1Hlpphilq+E79SCujlavGoFJQVgP1HN4jvO3ce6+ympX5RTjqhdWNNZTpIUs3xVboqpdmFNxMkAhCaEWXzIIa1bmjvLqUPM8tW6dUdEiQCEioJCryosKVLC2/fblvB2FY4eGVEiAKGioNCrqhZVa/W4RGdJJbxdq8clVLWoOuJkAEJDQaFXlcbjWrxqhTbOLVPdzKHaOLeMCRIAuhXVLD7ksNJ4XEuW3R51DACBYwQFAAgSBQUACBIFBQAIEgUFAAgSBQUACBIFBQAIEgUFAAgSBQUACBIFBQAIEgUFAAgSBQUACBIFBQAIEgUFAAgSBQUACBIFBQAIEgUFAAgSBQUACBIFBQAIEgUFAAgSBQUACBIFBQAIEgUFAAgSBQUACBIFBQAIEgUFAAgSBQUACBIFBQAIEgUFAAgSBQUACBIFBQAIEgUFAAgSBQUACBIFBQAIEgUFAAgSBQUACBIFBQAIUkHUARCGxoYG1S6sUeuWZhWWFKlqUbVK4/GoYwHIYRQU1NjQoAWz56hyU0wxy1fCd2pB3RwtXrWCkgIQGS7xQbULazrLSZJilq/KTTHVLqyJOBmAXEZBQa1bmjvLqUPM8tW6dUdEiQAg4oIys2vMzM2sKMocua6wpEgJb99vW8LbVTh6ZESJACDCgjKzsZJmS3otqgxIqlpUrdXjEp0llfB2rR6XUNWi6oiTAchlUY6gbpJ0rSSPMAMklcbjWrxqhTbOLVPdzKHaOLeMCRIAIhfJLD4zO0fSFnd/3swyvfZSSZdK0vvf//5eSJebSuNxLVl2e9QxAKBT1grKzB6TNKqbXV+XVC3pIwdzHHe/TdJtklRRUcFoCwByRNYKyt0/3N12MztRUlxSx+hpjKRnzWy6u2/PVh4AQN/S65f43P1FSUd3fG5mjZIq3L25t7MAAMLF+6AAAEGK/FFH7l4adQYAQHgYQQEAgkRBAQCCREEBAIJEQQEAgkRBAQCCREEBAIJEQQEAgkRBAQCCREEBAIJEQQEAgkRBAQCCREEBAIJEQQEAgkRBAQCCFPlyG6FpbGhQ7cIatW5pVmFJkaoWVas0Ho86FgDkHAoqTWNDgxbMnqPKTTHFLF8J36kFdXO0eNUKSgoAehmX+NLULqzpLCdJilm+KjfFVLuwJuJkAJB7KKg0rVuaO8upQ8zy1bp1R0SJACB3UVBpCkuKlPD2/bYlvF2Fo0dGlAgAchcFlaZqUbVWj0t0llTC27V6XEJVi6ojTgYAuYeCSlMaj2vxqhXaOLdMdTOHauPcMiZIAEBEmMXXRWk8riXLbo86BgDkPEZQAIAgUVAAgCBRUACAIFFQAIAgUVAAgCBRUACAIFFQAIAgUVAAgCBRUACAIJm7R53hoJlZk6RXs3DoIknNWTju4SBTZqHlkch0MKLOc6y7F0d4fhykPlVQ2WJm9e5eEXWOdGTKLLQ8EpkORmh5EC4u8QEAgkRBAQCCREEl3RZ1gG6QKbPQ8khkOhih5UGguAcFAAgSIygAQJAoKABAkCioLszsGjNzMysKIMtiM3vZzF4ws/vNbFhEOc40sw1m9icz+2oUGbrkGWtmj5vZejNbZ2bzos4kSWaWb2bPmdlDUWeRJDMbZmY/S/0bWm9mpwWQ6arU39laM7vLzAZGnQnhoqDSmNlYSbMlvRZ1lpRVkia5+0mSNkr6Wm8HMLN8Sd+XdJakiZIuNLOJvZ2jizZJV7v7BEkzJH05gEySNE/S+qhDpLlF0kp3Hy/pZEWczcxKJFVJqnD3SZLyJX02ykwIGwW1v5skXSspiJkj7v4rd29LfVonaUwEMaZL+pO7v+Lub0m6W9K5EeTo5O7b3P3Z1Me7lfzGWxJlJjMbI+njku6IMkcHMztK0t9LulOS3P0td38z2lSSpAJJg8ysQNJgSVsjzoOAUVApZnaOpC3u/nzUWQ7gi5J+GcF5SyS9nvb5ZkVcBunMrFTSZElPR5tENyv5w82+iHN0+DtJTZJ+lLrseIeZFUYZyN23SPqeklcotklqcfdfRZkJYcupgjKzx1LXvrv+OlfS1yV9I7BMHa/5upKXtZb3dj5J1s22IEaYZjZE0r2SrnT3XRHmOFvSG+6+JqoM3SiQNEXSre4+WVKrpEjvH5rZcCVH33FJoyUVmtlFUWZC2AqiDtCb3P3D3W03sxOV/E/zvJlJyUtpz5rZdHffHkWmtGwXSzpbUqVH86a1zZLGpn0+RgFcljGzAUqW03J3vy/iOKdLOsfMPiZpoKSjzGyZu0f5zXezpM3u3jGy/JkiLihJH5bU4O5NkmRm90n6gKRlkaZCsHJqBHUg7v6iux/t7qXuXqrkf+4p2S6nTMzsTElfkXSOu/81ohi/l1RmZnEzO0LJm9o/jyiLJMmSP0XcKWm9uy+JMoskufvX3H1M6t/OZyX9OuJyUurf7utmdnxqU6WklyKMJCUv7c0ws8Gpv8NKhTWpBIHJqRFUH/TvkmKSVqVGdnXufnlvBnD3NjO7QtKjSs66+qG7r+vNDN04XdLnJL1oZn9Ibat290cizBSif5G0PPWDxSuSvhBlGHd/2sx+JulZJS9ZPycee4R3waOOAABB4hIfACBIFBQAIEgUFAAgSBQUACBIFBQAIEgUFLLCzNrN7A+pp2L81MwGp7aPMrO7zWyTmb1kZo+YWXlq30ozezPT08DN7GYz+/vUx8tTT1pfa2Y/TL2Bt+N1M1MZ1pnZbw9wrLiZPW1mfzSzn6SmZMvM/iV1zEfStn3QzJakfW2xma083D8rAN2joJAte9z9lNRTq9+SdHnqzZn3S/qNu49z94mSqiW9L/U1i5V8f9MBmdkISTPc/YnUpuWSxks6UdIgSV9KvW6YpP9Q8k3OJ0j6xwMc8gZJN7l7maSdkv45tf1Lkk5S8r06H01lXyhpUccXpp6IsM3MTj+YPxAAh4aCQm94UtJxkmZJ2uvuP+jY4e5/cPcnUx+vlrQ7w7EukNQ5anH3RzxF0jN6+4nvcyTd5+6vpV73RtcDpUrnDCUfAyRJ/yXpvLSXDFDyidt7lSzOR9x9Z5fDPCBpbobMAN4DCgpZlVpW4SxJL0qaJOlwH6h6enfHSF3a+5zeLq9yScPN7DdmtsbM/qmbY42U9GbakibpT2r/npJLnBRL+m9JFys5IuuqXtKH3uPvBcC7oKCQLYNSjyGqV/IZbHf20HGPUXIZia7+Q9ITHaMxJR/jNVXJNZo+Kmlhx72uNAd8Uru7L3X3yaln6s2XVCvprNQKtTeZWcf/nTeUfDI3gB7Gs/iQLXvc/ZT0DWa2TslLdId1XCWfGJ5+3G8qOdK5LG3zZknN7t4qqdXMnlByVdmNaa9pljTMzApSo6h3PKndzEZLmubu3zKzZySdJuk6JR90uiqVZc9h/p4AdIMRFHrTryXFzOySjg1mNs3M/uEQjrFeyftZHV//JSVHSBe6e/pigQ9K+pCZFaRmEJ6qLk/OTt23elxvl+bFqa9Lt0jJyRFSchKGK7ko4eDUtnJJaw8hP4CDREGh16QK4ZOSZqemma+T9G9KjVrM7ElJP5VUaWabzeyj3RzmYUkz0z7/gZKzAP8nNaX8G6lzrVfyftQLSk6euMPd16bO80hqZCQllzOZb2Z/UvKeVOelSDObnDrWc6lNdyp5L22K3r7XNSuVCUAP42nm6HPM7ClJZ7v7mwFkeULSud3M7gNwmCgo9DlmdqqS97heiDhHsaTT3f2BKHMA/RUFBQAIEvegAABBoqAAAEGioAAAQaKgAABBoqAAAEH6/0Sc7jypEfkTAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x360 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "local_Ne = 1e5\n",
    "g = 0\n",
    "split_time = 2e5\n",
    "pop1_samples = 8\n",
    "pop2_samples = 8\n",
    "nloci = 100\n",
    "length=150\n",
    "\n",
    "migmat = [[0, 0], [0, 0]]\n",
    "\n",
    "def simulate_nloci(n=1, split_time=2e5, pop1_samples=8, pop2_samples=8, nloci=100, migmat=[[0, 0], [0, 0]]):\n",
    "    ts_list = []\n",
    "    for i in xrange(n):\n",
    "        pop1 = msprime.PopulationConfiguration(sample_size=pop1_samples,\n",
    "                                            initial_size=local_Ne,\n",
    "                                            growth_rate=g)\n",
    "\n",
    "        pop2 = msprime.PopulationConfiguration(sample_size=pop2_samples,\n",
    "                                            initial_size=local_Ne,\n",
    "                                            growth_rate=g)\n",
    "\n",
    "        split_event = msprime.MassMigration(time=split_time,\n",
    "                                            source=0,\n",
    "                                            destination=1,\n",
    "                                            proportion=1)\n",
    "\n",
    "        debug = msprime.DemographyDebugger(population_configurations=[pop1, pop2],\n",
    "                                            demographic_events=[split_event],\n",
    "                                            migration_matrix=migmat)\n",
    "\n",
    "        tree_sequence = msprime.simulate(length=length,\\\n",
    "                                            migration_matrix=migmat,\\\n",
    "                                            mutation_rate=1e-8, \\\n",
    "                                            population_configurations=[pop1, pop2],\\\n",
    "                                            demographic_events=[split_event])\n",
    "        ts_list.append(tree_sequence)\n",
    "\n",
    "    return ts_list\n",
    "\n",
    "def plot_tree_sequence(tree_sequence):\n",
    "    tree = tree_sequence.first()\n",
    "    colour_map = {0:\"red\", 1:\"blue\"}\n",
    "    node_colours = {u: colour_map[tree.population(u)] for u in tree.nodes()}\n",
    "    display(SVG(tree.draw(width=600, height=400, node_colours=node_colours)))\n",
    "\n",
    "def write_vcf(ts_list, outfile, unlinked=False):\n",
    "    ## get the header\n",
    "    head = []\n",
    "    with TemporaryFile() as outtmp:\n",
    "        ts_list[0].write_vcf(outtmp, ploidy=2, contig_id='1')\n",
    "        outtmp.seek(0)\n",
    "        head = outtmp.readlines()[:6]\n",
    "\n",
    "    with open(outfile, 'w') as output:\n",
    "        output.write(\"\".join(head))\n",
    "        for i, ts in enumerate(ts_list):\n",
    "            with TemporaryFile() as outtmp:\n",
    "                ## the +2 here is because enumerate starts at 0, and we\n",
    "                ## want our first locus of this list to start at 2\n",
    "                ts.write_vcf(outtmp, ploidy=2, contig_id=str(i+2))\n",
    "                outtmp.seek(0)\n",
    "                ## Get rid of the 6 lines of vcf header\n",
    "                dat = outtmp.readlines()[6:]\n",
    "                ## Add the AA field\n",
    "                new_dat = [x[:x.rfind(\".\")] + \"AA=A\" + x[x.rfind(\".\")+1:] for x in dat]\n",
    "                dat = new_dat\n",
    "                if unlinked:\n",
    "                    try:\n",
    "                        dat = np.random.choice(dat, size=1)\n",
    "                    except:\n",
    "                        ## ignore loci with no snps\n",
    "                        pass\n",
    "                output.write(\"\".join(dat))\n",
    "\n",
    "def simulate_missing(infile, outfile=None, avg_missing=0.3, std_missing=0.05):\n",
    "    if outfile == None:\n",
    "        outfile = infile\n",
    "    dat = ''\n",
    "    header = ''\n",
    "    with open(infile) as infile:\n",
    "        dat = infile.readlines()\n",
    "        header = dat[:6]\n",
    "        dat = pd.DataFrame([x.split() for x in dat[6:]], columns=header[-1].split())\n",
    "        samps = [x for x in dat.columns if \"msp\" in x]\n",
    "        missingness = np.random.normal(avg_missing, std_missing, len(samps))\n",
    "        for samp, miss in zip(samps, missingness):\n",
    "            #print(samp, miss, len(dat[samp]))\n",
    "            draws = np.random.uniform(size=len(dat[samp]))\n",
    "            dat[samp] = np.where(draws < miss, \".|.\", dat[samp])\n",
    "    with open(outfile, 'w') as outfile:\n",
    "        outfile.write(\"\".join(header))\n",
    "        dat.to_csv(outfile, sep=\"\\t\", index=False, header=False)\n",
    "\n",
    "## 100 loci is almost instant\n",
    "## 1000 loci ~5 seconds\n",
    "migmat = [[0, 1e-5], [1e-5, 0]]\n",
    "ts_list = simulate_nloci(n=nloci, migmat=migmat)\n",
    "#plot_tree_sequence(ts_list[0])\n",
    "write_vcf(ts_list, \"./{}/tmp.vcf\".format(simout), unlinked=True)\n",
    "simulate_missing(\"{}/tmp.vcf\".format(simout), avg_missing=0.5)\n",
    "pca = ipa.pca(\"./{}/tmp.vcf\".format(simout))\n",
    "pca.plot()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Simulate vcf files serially"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 230,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Doing no_migration 0\n",
      "Doing no_migration 1\n",
      "Doing no_migration 2\n",
      "Doing no_migration 3\n",
      "Doing no_migration 4\n",
      "Doing symmetric_migration 0\n",
      "Doing symmetric_migration 1\n",
      "Doing symmetric_migration 2\n",
      "Doing symmetric_migration 3\n",
      "Doing symmetric_migration 4\n",
      "Doing asymmetric_migration 0\n",
      "Doing asymmetric_migration 1\n",
      "Doing asymmetric_migration 2\n",
      "Doing asymmetric_migration 3\n",
      "Doing asymmetric_migration 4\n"
     ]
    }
   ],
   "source": [
    "nsims = 5\n",
    "models = {\"no_migration\":[[0, 0], [0, 0]],\n",
    "          \"symmetric_migration\":[[0, 1e-5], [1e-5, 0]],\n",
    "          \"asymmetric_migration\":[[0, 1e-5], [1e-6, 0]]}\n",
    "\n",
    "for model, migmat in models.items():\n",
    "    outdir = \"./{}/{}\".format(simout, model)\n",
    "    if not os.path.exists(outdir):\n",
    "        os.mkdir(outdir)\n",
    "    for i in xrange(nsims):\n",
    "        print(\"Doing {} {}\".format(model, i))\n",
    "        ts_list = simulate_nloci(nloci, migmat=migmat)\n",
    "        write_vcf(ts_list, \"./{}/{}-sim{}.vcf\".format(outdir, model, i), unlinked=False)\n",
    "        simulate_missing(\"{}/{}-sim{}.vcf\".format(outdir, model, i))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Simulate vcf files in parallel\n",
    "Broken"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Define a function so we can parallelize the above cell\n",
    "def sim_vcf(model, nloci, missingness, migmat, outdir, rep):\n",
    "    ts_list = simulate_nloci(nloci, migmat=migmat)\n",
    "    write_vcf(ts_list, \"./{}/{}-sim{}.vcf\".format(outdir, model, rep), unlinked=False)\n",
    "    simulate_missing(\"{}/{}-sim{}.vcf\".format(outdir, model, rep), avg_missing=missingness)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 242,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5\n"
     ]
    }
   ],
   "source": [
    "## Run this in a python2.7 env and wait for clients to attach\n",
    "#!ipcluster start -n 4 --cluster-id=ipp2 --daemonize\n",
    "ipyclient = ipp.Client(cluster_id=\"ipp2\")\n",
    "print(len(ipyclient.ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 244,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [####################] 100%  Simulating vcf files  |  0:00:01 \n",
      "  [####################] 100%  Simulating vcf files  |  0:00:01 \n",
      "  [####################] 100%  Simulating vcf files  |  0:00:01 \n"
     ]
    }
   ],
   "source": [
    "nsims = 5\n",
    "nloci = 1000\n",
    "missingness = .3\n",
    "models = {\"no_migration\":[[0, 0], [0, 0]],\n",
    "          \"symmetric_migration\":[[0, 1e-5], [1e-5, 0]],\n",
    "          \"asymmetric_migration\":[[0, 1e-5], [1e-6, 0]]}\n",
    "\n",
    "ipyclient[:].use_dill()\n",
    "ipyclient[:].push(dict(simulate_nloci=simulate_nloci, write_vcf=write_vcf, simulate_missing=simulate_missing))\n",
    "lbview = ipyclient.load_balanced_view()\n",
    "parallel_jobs = {}\n",
    "\n",
    "start = time.time()\n",
    "printstr = \" Simulating vcf files  |  {}\"\n",
    "for model, migmat in models.items():\n",
    "    outdir = \"./{}/{}\".format(simout, model)\n",
    "    if not os.path.exists(outdir):\n",
    "        os.mkdir(outdir)\n",
    "    for rep in xrange(nsims):\n",
    "        parallel_jobs[rep] = lbview.apply(sim_vcf, *(model, nloci, missingness, migmat, outdir, rep))\n",
    "    while 1:\n",
    "        fin = [i.ready() for i in parallel_jobs.values()]\n",
    "        elapsed = datetime.timedelta(seconds=int(time.time()-start))\n",
    "        progressbar(len(fin), sum(fin),\n",
    "                    printstr.format(elapsed))\n",
    "        time.sleep(0.1)\n",
    "        if len(fin) == sum(fin):\n",
    "            print(\"\")\n",
    "            break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 250,
   "metadata": {},
   "outputs": [
    {
     "ename": "RemoteError",
     "evalue": "NameError(global name 'simulate_nloci' is not defined)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m\u001b[0m\u001b[0;31mNameError\u001b[0mTraceback (most recent call last)\u001b[0;32m<string>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m",
      "\u001b[0;32m<ipython-input-192-56baa67d5954>\u001b[0m in \u001b[0;36msim_vcf\u001b[0;34m(model, nloci, missingness, migmat, outdir, rep)\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m: global name 'simulate_nloci' is not defined"
     ]
    }
   ],
   "source": [
    "#parallel_jobs[2].result()\n",
    "ipyclient[0].push(dict(simulate_nloci=simulate_nloci, write_vcf=write_vcf, simulate_missing=simulate_missing))\n",
    "wat = ipyclient[0].apply(sim_vcf, *(model, nloci, .3, migmat, \"/tmp/\", 0))\n",
    "print(wat.result())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Crap below here"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.3044425  0.29402902 0.2848071  0.28072836 0.32856761 0.2867087\n",
      " 0.28905754 0.28444419]\n",
      "[0.13055066 0.27542815 0.31284536 0.08255794 0.0181362  0.28084621\n",
      " 0.25152279 0.01150193]\n",
      "6\t60\t.\tA\tT\t.\tPASS\t.\tGT\t1|0\t0|0\t0|0\t1|0\t0|0\t0|0\t0|0\t0|0\n",
      "\n"
     ]
    }
   ],
   "source": [
    "avg_missing = 0.3\n",
    "std_missing = 0.05\n",
    "\n",
    "missingness = np.random.normal(avg_missing, std_missing, 8)\n",
    "print(missingness)\n",
    "draw = np.random.uniform(size=8)\n",
    "print(draw)\n",
    "ts = ts_list[1]\n",
    "with TemporaryFile() as outtmp:\n",
    "    ts.write_vcf(outtmp, ploidy=2, contig_id=str(i+2))\n",
    "    outtmp.seek(0)\n",
    "    ## Get rid of the 6 lines of vcf header\n",
    "    dat = outtmp.readlines()[6:]\n",
    "    print(dat[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 288,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "ts_list = simulate_nloci(1000)\n",
    "write_vcf(ts_list, \"./1000loci.vcf\", unlinked=True)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
back to top