Updated README and Python Code

anurag
1 parent 70ad245597
Showing 1 changed file with 217 additions and 0 deletions Show diff stats
nbi_simulation_new_for_git.ipynb
@@ -0,0 +1,217 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import csv\n",
+    "import numpy.matlib\n",
+    "from operator import itemgetter, attrgetter\n",
+    "from sklearn.model_selection import KFold\n",
+    "from sklearn.metrics import roc_curve, auc\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "6319 3772 (3772, 6319) 26093.0 (6319, 3772)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load the drug-target edge list (one 'drug,target' pair per line) and build the\n",
+    "# binary incidence matrix A: rows = unique targets, columns = unique drugs.\n",
+    "# Alternative input used previously: 'dt_new_and_fda_unique.csv'\n",
+    "drugid = []\n",
+    "targetid = []\n",
+    "\n",
+    "##1. Reading edge list line by line##\n",
+    "# 'with' closes the file even when a malformed line raises.\n",
+    "with open('new_dt_from_go_and_db_unique_latest.csv','r') as fp:\n",
+    "    for line in fp:\n",
+    "        line = line.strip()\n",
+    "        if not line:\n",
+    "            # tolerate blank lines (e.g. a trailing newline at end of file)\n",
+    "            continue\n",
+    "        tmp = line.split(',')\n",
+    "        drugid.append(tmp[0])\n",
+    "        targetid.append(tmp[1])\n",
+    "##End 1##\n",
+    "\n",
+    "drug = np.array(drugid)\n",
+    "target = np.array(targetid)\n",
+    "\n",
+    "# np.unique returns the sorted unique ids; the inverse arrays give, for every\n",
+    "# edge, the position of its drug/target inside those sorted id lists.\n",
+    "uni_drugid, drug_pos = np.unique(drug, return_inverse=True)\n",
+    "uni_targetid, target_pos = np.unique(target, return_inverse=True)\n",
+    "\n",
+    "nd = uni_drugid.shape[0]\n",
+    "mt = uni_targetid.shape[0]\n",
+    "\n",
+    "##creating incidence matrix for the graph##\n",
+    "# One vectorised assignment marks every (target, drug) edge; duplicate edges\n",
+    "# simply set the same cell to 1 again.\n",
+    "A = np.zeros((mt, nd))\n",
+    "A[target_pos, drug_pos] = 1\n",
+    "\n",
+    "A_T = np.transpose(A)\n",
+    "no_edges = np.sum(A)\n",
+    "print nd, mt, A.shape, np.sum(A), A_T.shape\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(3772, 3772)\n",
+      "(3772, 6319)\n"
+     ]
+    }
+   ],
+   "source": [
+    "#NBI calculation for A\n",
+    "# With ky = column sums of A and kx = row sums of A this cell computes\n",
+    "#   W2[i, j] = (1 / kx[j]) * sum_l A[i, l] * A[j, l] / ky[l]\n",
+    "#   NBIscore = W2 . A\n",
+    "\n",
+    "n = A.shape[0]\n",
+    "m = A.shape[1]\n",
+    "#print n, m\n",
+    "ky = np.sum(A, axis=0)\n",
+    "kx = np.sum(A, axis=1)\n",
+    "\n",
+    "# Reciprocal degrees; a zero degree yields inf/nan, which is reset to 0.\n",
+    "with np.errstate(divide='ignore', invalid='ignore'):\n",
+    "    inv_ky = 1.0 / ky\n",
+    "    inv_kx = 1.0 / kx\n",
+    "inv_ky[~np.isfinite(inv_ky)] = 0\n",
+    "inv_kx[~np.isfinite(inv_kx)] = 0\n",
+    "\n",
+    "# Broadcasting scales column l of A by 1/ky[l] without materialising an\n",
+    "# m x m diagonal matrix.\n",
+    "W = np.transpose(A * inv_ky)\n",
+    "W1 = np.dot(A, W)\n",
+    "# Scale column j of W1 by 1/kx[j].\n",
+    "W2 = W1 * inv_kx\n",
+    "print W2.shape\n",
+    "NBIscore = np.dot(W2, A)\n",
+    "print NBIscore.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "#NBI calculation for A_T\n",
+    "# Same computation as for A, applied to the transposed incidence matrix:\n",
+    "#   W2[i, j] = (1 / kx[j]) * sum_l A_T[i, l] * A_T[j, l] / ky[l]\n",
+    "#   NBIscore_T = W2 . A_T\n",
+    "\n",
+    "n = A_T.shape[0]\n",
+    "m = A_T.shape[1]\n",
+    "#print n, m\n",
+    "ky = np.sum(A_T, axis=0)\n",
+    "kx = np.sum(A_T, axis=1)\n",
+    "\n",
+    "# Reciprocal degrees; a zero degree yields inf/nan, which is reset to 0.\n",
+    "with np.errstate(divide='ignore', invalid='ignore'):\n",
+    "    inv_ky = 1.0 / ky\n",
+    "    inv_kx = 1.0 / kx\n",
+    "inv_ky[~np.isfinite(inv_ky)] = 0\n",
+    "inv_kx[~np.isfinite(inv_kx)] = 0\n",
+    "\n",
+    "# Broadcasting scales column l of A_T by 1/ky[l] without materialising a\n",
+    "# dense diagonal matrix.\n",
+    "W = np.transpose(A_T * inv_ky)\n",
+    "W1 = np.dot(A_T, W)\n",
+    "# Scale column j of W1 by 1/kx[j].\n",
+    "W2 = W1 * inv_kx\n",
+    "NBIscore_T = np.dot(W2, A_T)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "##Normalizing NBI scores\n",
+    "##Normalizing NBI scores\n",
+    "# Divide every column by its maximum. A column whose maximum is 0 gets a\n",
+    "# divisor of 1, so it stays all-zero instead of becoming NaN through 0/0.\n",
+    "col_max = np.max(NBIscore, axis=0)\n",
+    "col_max[col_max == 0] = 1\n",
+    "NBIscore = np.true_divide(NBIscore, col_max)\n",
+    "\n",
+    "col_max_T = np.max(NBIscore_T, axis=0)\n",
+    "col_max_T[col_max_T == 0] = 1\n",
+    "NBIscore_T = np.true_divide(NBIscore_T, col_max_T)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Column-wise ascending sort order: column d holds the row indices of\n",
+    "# column d of the score matrix, ordered from lowest to highest score.\n",
+    "nbi_idx = NBIscore.argsort(axis=0)\n",
+    "nbi_idx_T = NBIscore_T.argsort(axis=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# For every drug, rank the targets it is not already linked to in A by their\n",
+    "# normalized NBI score and write those scoring strictly above\n",
+    "# (best score - PERCENT_DIFF * best score). Output rows: drug id,target id,score\n",
+    "PERCENT_DIFF = 0.20\n",
+    "\n",
+    "# 'with' closes the output file even if a write fails part-way through.\n",
+    "with open('predicted_targets_for_all_drugs_using_percent_diff_0.20_.csv','w') as wp:\n",
+    "    for d in range(nd):\n",
+    "        order = nbi_idx[:,d]                  # target rows, ascending score\n",
+    "        unknown = order[A[order,d] == 0]      # drop targets already linked to drug d\n",
+    "        p_targets_idx = unknown[::-1]         # best candidate first\n",
+    "        p_targets = NBIscore[p_targets_idx,d]\n",
+    "        # Skip drugs with no candidate targets or whose best candidate scores 0.\n",
+    "        if p_targets.size == 0 or p_targets[0] == 0.0:\n",
+    "            continue\n",
+    "        th = p_targets[0]*PERCENT_DIFF\n",
+    "        th_f = p_targets[0]-th\n",
+    "        keep = p_targets > th_f\n",
+    "        f_targets = uni_targetid[p_targets_idx[keep]]\n",
+    "        f_scores = p_targets[keep]\n",
+    "        for t, score in zip(f_targets, f_scores):\n",
+    "            wp.write(uni_drugid[d] + ',' + t + ',' + str(score) + '\\n')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
...	...	@@ -0,0 +1,217 @@
	1	+{
	2	+ "cells": [
	3	+ {
	4	+ "cell_type": "code",
	5	+ "execution_count": 1,
	6	+ "metadata": {
	7	+ "collapsed": false
	8	+ },
	9	+ "outputs": [],
	10	+ "source": [
	11	+ "import numpy as np\n",
	12	+ "import csv\n",
	13	+ "import numpy.matlib\n",
	14	+ "from operator import itemgetter, attrgetter\n",
	15	+ "from sklearn.model_selection import KFold\n",
	16	+ "from sklearn.metrics import roc_curve, auc\n",
	17	+ "import matplotlib.pyplot as plt"
	18	+ ]
	19	+ },
	20	+ {
	21	+ "cell_type": "code",
	22	+ "execution_count": 2,
	23	+ "metadata": {
	24	+ "collapsed": false
	25	+ },
	26	+ "outputs": [
	27	+ {
	28	+ "name": "stdout",
	29	+ "output_type": "stream",
	30	+ "text": [
	31	+ "6319 3772 (3772, 6319) 26093.0 (6319, 3772)\n"
	32	+ ]
	33	+ }
	34	+ ],
	35	+ "source": [
	36	+ "#fp = open('dt_new_and_fda_unique.csv','r')\n",
	37	+ "fp = open('new_dt_from_go_and_db_unique_latest.csv','r')\n",
	38	+ "drugid = []\n",
	39	+ "targetid = []\n",
	40	+ "\n",
	41	+ "##1. Reading edge list line by line##\n",
	42	+ "for line in fp:\n",
	43	+ " line = line.strip()\n",
	44	+ " tmp = line.split(',')\n",
	45	+ " drugid.append(tmp[0])\n",
	46	+ " targetid.append(tmp[1])\n",
	47	+ "fp.close()\n",
	48	+ "##End 1##\n",
	49	+ "\n",
	50	+ "drug = np.array(drugid)\n",
	51	+ "target =np.array(targetid)\n",
	52	+ "\n",
	53	+ "uni_drugid = np.unique(np.array(drugid))\n",
	54	+ "uni_targetid = np.unique(np.array(targetid))\n",
	55	+ "\n",
	56	+ "##creating zero incidence matrix for the graph##\n",
	57	+ "\n",
	58	+ "A = np.zeros((uni_targetid.shape[0], uni_drugid.shape[0]))\n",
	59	+ "\n",
	60	+ "for i in range(len(drugid)):\n",
	61	+ " idx1 = np.where(uni_targetid==targetid[i])\n",
	62	+ " idx2 = np.where(uni_drugid==drugid[i])\n",
	63	+ " A[idx1,idx2] = 1\n",
	64	+ "\n",
	65	+ "nd = uni_drugid.shape[0]\n",
	66	+ "mt = uni_targetid.shape[0]\n",
	67	+ "\n",
	68	+ "A_T = np.transpose(A)\n",
	69	+ "no_edges = np.sum(A)\n",
	70	+ "print nd, mt, A.shape, np.sum(A), A_T.shape\n"
	71	+ ]
	72	+ },
	73	+ {
	74	+ "cell_type": "code",
	75	+ "execution_count": 3,
	76	+ "metadata": {
	77	+ "collapsed": false
	78	+ },
	79	+ "outputs": [
	80	+ {
	81	+ "name": "stdout",
	82	+ "output_type": "stream",
	83	+ "text": [
	84	+ "(3772, 3772)\n",
	85	+ "(3772, 6319)\n"
	86	+ ]
	87	+ }
	88	+ ],
	89	+ "source": [
	90	+ "#NBI calculation for A\n",
	91	+ "\n",
	92	+ "Ky = np.diag((1/sum(A))) \n",
	93	+ "n = A.shape[0]\n",
	94	+ "m = A.shape[1]\n",
	95	+ "#print n, m, Ky.shape\n",
	96	+ "Ky[np.isinf(Ky) \| np.isnan(Ky)] = 0\n",
	97	+ "kx = np.transpose(np.sum(A,1))\n",
	98	+ "#print kx.shape\n",
	99	+ "Nx = np.matlib.repmat(1/kx,n,1)\n",
	100	+ "Nx[np.isinf(Nx) \| np.isnan(Nx)] = 0\n",
	101	+ "#kx[np.isinf(kx) \| np.isnan(kx)] = 0\n",
	102	+ "W = np.transpose(np.dot(A, Ky))\n",
	103	+ "W1 = np.dot(A, W)\n",
	104	+ "W2 = np.multiply(Nx, W1)\n",
	105	+ "print W2.shape\n",
	106	+ "NBIscore = np.dot(W2, A)\n",
	107	+ "print NBIscore.shape"
	108	+ ]
	109	+ },
	110	+ {
	111	+ "cell_type": "code",
	112	+ "execution_count": 4,
	113	+ "metadata": {
	114	+ "collapsed": true
	115	+ },
	116	+ "outputs": [],
	117	+ "source": [
	118	+ "#NBI calculation for A_T\n",
	119	+ "\n",
	120	+ "Ky = np.diag((1/sum(A_T)))\n",
	121	+ "n = A_T.shape[0]\n",
	122	+ "m = A_T.shape[1]\n",
	123	+ "#print n, m, Ky.shape\n",
	124	+ "Ky[np.isinf(Ky) \| np.isnan(Ky)] = 0\n",
	125	+ "kx = np.transpose(np.sum(A_T,1))\n",
	126	+ "#print kx.shape\n",
	127	+ "Nx = np.matlib.repmat(1/kx,n,1)\n",
	128	+ "Nx[np.isinf(Nx) \| np.isnan(Nx)] = 0\n",
	129	+ "#kx[np.isinf(kx) \| np.isnan(kx)] = 0\n",
	130	+ "W = np.transpose(np.dot(A_T, Ky))\n",
	131	+ "W1 = np.dot(A_T, W)\n",
	132	+ "W2 = np.multiply(Nx, W1)\n",
	133	+ "NBIscore_T = np.dot(W2, A_T)"
	134	+ ]
	135	+ },
	136	+ {
	137	+ "cell_type": "code",
	138	+ "execution_count": 5,
	139	+ "metadata": {
	140	+ "collapsed": false
	141	+ },
	142	+ "outputs": [],
	143	+ "source": [
	144	+ "##Normalizing NBI scores\n",
	145	+ "NBIscore = np.true_divide(NBIscore, np.max(NBIscore, axis=0))\n",
	146	+ "NBIscore_T = np.true_divide(NBIscore_T, np.max(NBIscore_T, axis=0))"
	147	+ ]
	148	+ },
	149	+ {
	150	+ "cell_type": "code",
	151	+ "execution_count": 7,
	152	+ "metadata": {
	153	+ "collapsed": false
	154	+ },
	155	+ "outputs": [],
	156	+ "source": [
	157	+ "nbi_idx = np.argsort(NBIscore, axis=0)\n",
	158	+ "nbi_idx_T= np.argsort(NBIscore_T, axis=0)"
	159	+ ]
	160	+ },
	161	+ {
	162	+ "cell_type": "code",
	163	+ "execution_count": 8,
	164	+ "metadata": {
	165	+ "collapsed": false
	166	+ },
	167	+ "outputs": [],
	168	+ "source": [
	169	+ "wp = open('predicted_targets_for_all_drugs_using_percent_diff_0.20_.csv','w')\n",
	170	+ "for d in range(nd):\n",
	171	+ " idx1 = nbi_idx[:,d]\n",
	172	+ " #idx1 = idx1[::-1]\n",
	173	+ " idx2 = A[:,d]\n",
	174	+ " idx3 = idx2[idx1]\n",
	175	+ " idx4 = np.where(idx3 == 0)[0]\n",
	176	+ " p_targets_idx = idx1[idx4[-n:]]\n",
	177	+ " p_targets_idx = p_targets_idx[::-1]\n",
	178	+ " p_targets = NBIscore[p_targets_idx,d]\n",
	179	+ " if p_targets[0] == 0.0:\n",
	180	+ " continue\n",
	181	+ " else:\n",
	182	+ " p_diff = np.diff(p_targets)\n",
	183	+ " th = p_targets[0]*0.20\n",
	184	+ " th_f = p_targets[0]-th\n",
	185	+ " f_idx = p_targets_idx[p_targets > th_f]\n",
	186	+ " f_scores = p_targets[p_targets > th_f]\n",
	187	+ " f_targets = uni_targetid[f_idx]\n",
	188	+ " #print p_targets[0], th, th_f, p_targets[p_targets > th_f], p_targets\n",
	189	+ " for i,t in enumerate(f_targets):\n",
	190	+ " wp.write(uni_drugid[d] + ',' + t + ',' + str(f_scores[i]) + '\\n')\n",
	191	+ "\n",
	192	+ "wp.close()"
	193	+ ]
	194	+ }
	195	+ ],
	196	+ "metadata": {
	197	+ "kernelspec": {
	198	+ "display_name": "Python 2",
	199	+ "language": "python",
	200	+ "name": "python2"
	201	+ },
	202	+ "language_info": {
	203	+ "codemirror_mode": {
	204	+ "name": "ipython",
	205	+ "version": 2
	206	+ },
	207	+ "file_extension": ".py",
	208	+ "mimetype": "text/x-python",
	209	+ "name": "python",
	210	+ "nbconvert_exporter": "python",
	211	+ "pygments_lexer": "ipython2",
	212	+ "version": "2.7.6"
	213	+ }
	214	+ },
	215	+ "nbformat": 4,
	216	+ "nbformat_minor": 2
	217	+}
...	...