@@ -256,7 +256,7 @@
     },
     "outputs": [],
     "source": [
-    "def generate_session(policy, t_max=int(10**4)):\n",
+    "def generate_session(env, policy, t_max=int(10**4)):\n",
     "    \"\"\"\n",
     "    Play game until end or for t_max ticks.\n",
     "    :param policy: an array of shape [n_states,n_actions] with action probabilities\n",
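
The body of generate_session is outside this hunk; only the signature changes, so the environment is passed in explicitly instead of being read from an outer scope. A minimal sketch of what the updated signature implies, assuming the classic Gym API where reset() returns a state and step() returns (state, reward, done, info):

import numpy as np

def generate_session(env, policy, t_max=int(10**4)):
    # Hypothetical sketch; the real body is not shown in this diff.
    states, actions = [], []
    total_reward = 0.0
    s = env.reset()
    for t in range(t_max):
        # Sample an action from the probabilities stored in policy[s].
        a = np.random.choice(len(policy[s]), p=policy[s])
        new_s, r, done, info = env.step(a)
        states.append(s)
        actions.append(a)
        total_reward += r
        s = new_s
        if done:
            break
    return states, actions, total_reward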

@@ -293,7 +293,7 @@
     },
     "outputs": [],
     "source": [
-    "s, a, r = generate_session(policy)\n",
+    "s, a, r = generate_session(env, policy)\n",
     "assert type(s) == type(a) == list\n",
     "assert len(s) == len(a)\n",
     "assert type(r) in [float, np.float64]"

@@ -337,7 +337,7 @@
     "import matplotlib.pyplot as plt\n",
     "%matplotlib inline\n",
     "\n",
-    "sample_rewards = [generate_session(policy, t_max=1000)[-1] for _ in range(200)]\n",
+    "sample_rewards = [generate_session(env, policy, t_max=1000)[-1] for _ in range(200)]\n",
     "\n",
     "plt.hist(sample_rewards, bins=20)\n",
     "plt.vlines([np.percentile(sample_rewards, 50)], [0], [100], label=\"50'th percentile\", color='green')\n",
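
The percentile drawn on this histogram is the threshold later used by select_elites, which the training loop below calls but this diff does not touch. A sketch of the usual implementation in this assignment (an assumption, since the function is not shown), keeping state-action pairs from sessions whose total reward meets the percentile cutoff:

def select_elites(states_batch, actions_batch, rewards_batch, percentile=50):
    # Sessions at or above the percentile threshold count as "elite".
    reward_threshold = np.percentile(rewards_batch, percentile)
    elite_states, elite_actions = [], []
    for states, actions, reward in zip(states_batch, actions_batch, rewards_batch):
        if reward >= reward_threshold:
            elite_states.extend(states)
            elite_actions.extend(actions)
    return elite_states, elite_actions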

@@ -464,7 +464,7 @@
     },
     "outputs": [],
     "source": [
-    "def update_policy(elite_states, elite_actions):\n",
+    "def update_policy(elite_states, elite_actions, n_states, n_actions):\n",
     "    \"\"\"\n",
     "    Given old policy and a list of elite states/actions from select_elites,\n",
     "    return new updated policy where each action probability is proportional to\n",
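
Here too, only the signature is part of the diff: n_states and n_actions are now arguments rather than globals. A sketch of a body consistent with the docstring and with the NaN assert in the next cell (the uniform fallback for unvisited states is what avoids the 0/0 that assert guards against; this body is an assumption, not the notebook's):

def update_policy(elite_states, elite_actions, n_states, n_actions):
    # Count how often each action was taken in each elite state,
    # then normalize each state's row into a probability distribution.
    new_policy = np.zeros([n_states, n_actions])
    for s, a in zip(elite_states, elite_actions):
        new_policy[s, a] += 1
    for s in range(n_states):
        total = new_policy[s].sum()
        if total == 0:
            # State never visited by an elite session: fall back to uniform
            # so we never divide by zero.
            new_policy[s] = 1.0 / n_actions
        else:
            new_policy[s] /= total
    return new_policy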

@@ -493,7 +493,7 @@
     "elite_states = [1, 2, 3, 4, 2, 0, 2, 3, 1]\n",
     "elite_actions = [0, 2, 4, 3, 2, 0, 1, 3, 3]\n",
     "\n",
-    "new_policy = update_policy(elite_states, elite_actions)\n",
+    "new_policy = update_policy(elite_states, elite_actions, n_states, n_actions)\n",
     "\n",
     "assert np.isfinite(new_policy).all(\n",
     "), \"Your new policy contains NaNs or +-inf. Make sure you don't divide by zero.\"\n",
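
For this test cell to run, n_states and n_actions must already be bound. Their definition is outside this diff; in this assignment they would typically come from the environment's discrete spaces, along the lines of (environment name assumed, not shown in the diff):

import gym

env = gym.make("Taxi-v3")  # assumed; the diff does not show env creation
n_states = env.observation_space.n
n_actions = env.action_space.n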

@@ -587,13 +587,13 @@
     "\n",
     "for i in range(100):\n",
     "\n",
-    "    %time sessions = [generate_session(policy) for _ in range(n_sessions)]\n",
+    "    %time sessions = [generate_session(env, policy) for _ in range(n_sessions)]\n",
     "\n",
     "    states_batch, actions_batch, rewards_batch = zip(*sessions)\n",
     "\n",
     "    elite_states, elite_actions = select_elites(states_batch, actions_batch, rewards_batch, percentile)\n",
     "\n",
-    "    new_policy = update_policy(elite_states, elite_actions)\n",
+    "    new_policy = update_policy(elite_states, elite_actions, n_states, n_actions)\n",
     "\n",
     "    policy = learning_rate*new_policy + (1-learning_rate)*policy\n",
     "\n",
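
The loop assumes a policy and the hyperparameters n_sessions, percentile, and learning_rate are initialized before line 587. A plausible setup (values are illustrative, not taken from the diff):

n_sessions = 250     # sessions sampled per iteration (illustrative)
percentile = 50      # elite threshold passed to select_elites (illustrative)
learning_rate = 0.5  # smoothing factor for the policy update (illustrative)

# Start from a uniform random policy over all state-action pairs.
policy = np.ones([n_states, n_actions]) / n_actions

The smoothing step learning_rate*new_policy + (1-learning_rate)*policy blends the elite-based estimate with the previous policy, so actions absent from the current elite set decay gradually instead of dropping straight to zero probability.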