From afc7f0b8d6d86785fc6cbd6d9ca5085327d4f920 Mon Sep 17 00:00:00 2001
From: cfeitong
Date: Sat, 6 Jan 2018 00:57:23 +0800
Subject: [PATCH 1/6] local rulinalg

---
 Cargo.toml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 5502be8f..f0a46151 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,5 +17,6 @@ datasets = []
 
 [dependencies]
 num = { version = "0.1.41", default-features = false }
-rand = "0.4.1"
-rulinalg = { git = "https://github.com/AtheMathmo/rulinalg", rev = "1ed8b937" }
+rand = "0.4"
+# rulinalg = { git = "https://github.com/AtheMathmo/rulinalg", rev = "1ed8b937" }
+rulinalg = { path = "../rulinalg" }

From 31a1137f81c7353581b7f794447285742ce0b86e Mon Sep 17 00:00:00 2001
From: cfeitong
Date: Sun, 21 Jan 2018 13:26:10 +0800
Subject: [PATCH 2/6] save work

---
 src/learning/optim/grad_desc.rs | 43 ++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/src/learning/optim/grad_desc.rs b/src/learning/optim/grad_desc.rs
index 1e114877..b951b628 100644
--- a/src/learning/optim/grad_desc.rs
+++ b/src/learning/optim/grad_desc.rs
@@ -107,6 +107,8 @@ pub struct StochasticGD {
     mu: f64,
     /// The number of passes through the data.
     iters: usize,
+    /// Whether to use Nesterov momentum.
+    nesterov_momentum: bool,
 }
 
 /// The default Stochastic GD algorithm.
@@ -116,12 +118,14 @@ pub struct StochasticGD {
 /// - alpha = 0.1
 /// - mu = 0.1
 /// - iters = 20
+/// - nesterov_momentum = false
 impl Default for StochasticGD {
     fn default() -> StochasticGD {
         StochasticGD {
             alpha: 0.1,
             mu: 0.1,
             iters: 20,
+            nesterov_momentum: false,
         }
     }
 }
@@ -132,8 +136,6 @@ impl StochasticGD {
     /// Requires the learning rate, momentum rate and iteration count
     /// to be specified.
     ///
-    /// With Nesterov momentum by default.
-    ///
     /// # Examples
     ///
     /// ```
@@ -149,8 +151,23 @@ impl StochasticGD {
             alpha: alpha,
             mu: mu,
             iters: iters,
+            nesterov_momentum: false,
         }
     }
+
+    /// Enable Nesterov momentum for the stochastic gradient descent algorithm.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use rusty_machine::learning::optim::grad_desc::StochasticGD;
+    ///
+    /// let sgd = StochasticGD::new(0.1, 0.3, 5).with_nesterov_momentum();
+    /// ```
+    pub fn with_nesterov_momentum(mut self) -> StochasticGD {
+        self.nesterov_momentum = true;
+        self
+    }
 }
 
 impl<M> OptimAlgorithm<M> for StochasticGD
@@ -184,15 +201,19 @@ impl<M> OptimAlgorithm<M> for StochasticGD
                                                       &inputs.select_rows(&[*i]),
                                                       &targets.select_rows(&[*i]));
 
-            // Backup previous velocity
-            let prev_w = delta_w.clone();
-            // Compute the difference in gradient using Nesterov momentum
-            delta_w = Vector::new(vec_data) * self.mu + &delta_w * self.alpha;
-            // Update the parameters
-            optimizing_val = &optimizing_val -
-                             (&prev_w * (-self.alpha) + &delta_w * (1. + self.alpha));
-            // Set the end cost (this is only used after the last iteration)
-            end_cost += cost;
+            if self.nesterov_momentum {
+                // Backup previous velocity
+                let prev_w = delta_w.clone();
+                // Compute the difference in gradient using Nesterov momentum
+                delta_w = Vector::new(vec_data) * self.mu + &delta_w * self.alpha;
+                // Update the parameters
+                optimizing_val = &optimizing_val -
+                                 (&prev_w * (-self.alpha) + &delta_w * (1. + self.alpha));
+                // Set the end cost (this is only used after the last iteration)
+                end_cost += cost;
+            } else {
+
+            }
         }
 
         end_cost /= inputs.rows() as f64;
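A note on the update rule patch 2 puts behind the new flag: with `g` the current gradient (`vec_data`), `v` the velocity (`delta_w`), `alpha` the decay on the old velocity and `mu` the gradient scale, the guarded branch computes `v <- mu*g + alpha*v` and then `w <- w - (-alpha*v_prev + (1 + alpha)*v)`. Below is a minimal standalone sketch of that arithmetic, using plain `f64` slices in place of rulinalg's `Vector`; the function and variable names are illustrative only, not part of the patch.

```rust
/// One Nesterov-style step mirroring the branch added in patch 2.
/// `params`/`velocity`/`grad` stand in for `optimizing_val`/`delta_w`/`vec_data`.
fn nesterov_step(params: &mut [f64], velocity: &mut [f64], grad: &[f64], alpha: f64, mu: f64) {
    for i in 0..params.len() {
        // Backup previous velocity (`prev_w` in the patch)
        let prev_v = velocity[i];
        // delta_w = vec_data * mu + delta_w * alpha
        velocity[i] = mu * grad[i] + alpha * prev_v;
        // optimizing_val -= prev_w * (-alpha) + delta_w * (1 + alpha)
        params[i] -= -alpha * prev_v + (1. + alpha) * velocity[i];
    }
}
```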
From 91f26b2d8fd91204300d0c9ba560e23f1583c4d0 Mon Sep 17 00:00:00 2001
From: cfeitong
Date: Sun, 21 Jan 2018 13:28:47 +0800
Subject: [PATCH 3/6] Revert "Nesterov Momentum"

This reverts commit ddc4c767b0ecd3d93c0b554ee05d880449957320.
---
 src/learning/optim/grad_desc.rs   | 9 ++-------
 tests/learning/optim/grad_desc.rs | 2 +-
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/src/learning/optim/grad_desc.rs b/src/learning/optim/grad_desc.rs
index 1e114877..471edca4 100644
--- a/src/learning/optim/grad_desc.rs
+++ b/src/learning/optim/grad_desc.rs
@@ -131,8 +131,6 @@ impl StochasticGD {
     ///
     /// Requires the learning rate, momentum rate and iteration count
     /// to be specified.
-    ///
-    /// With Nesterov momentum by default.
     ///
     /// # Examples
     ///
@@ -184,13 +182,10 @@ impl<M> OptimAlgorithm<M> for StochasticGD
                                                       &inputs.select_rows(&[*i]),
                                                       &targets.select_rows(&[*i]));
 
-            // Backup previous velocity
-            let prev_w = delta_w.clone();
-            // Compute the difference in gradient using Nesterov momentum
+            // Compute the difference in gradient using momentum
             delta_w = Vector::new(vec_data) * self.mu + &delta_w * self.alpha;
             // Update the parameters
-            optimizing_val = &optimizing_val -
-                             (&prev_w * (-self.alpha) + &delta_w * (1. + self.alpha));
+            optimizing_val = &optimizing_val - &delta_w * self.mu;
             // Set the end cost (this is only used after the last iteration)
             end_cost += cost;
         }
diff --git a/tests/learning/optim/grad_desc.rs b/tests/learning/optim/grad_desc.rs
index 9dd1281a..f9f74303 100644
--- a/tests/learning/optim/grad_desc.rs
+++ b/tests/learning/optim/grad_desc.rs
@@ -58,7 +58,7 @@ fn convex_gd_training() {
 fn convex_stochastic_gd_training() {
     let x_sq = XSqModel { c: 20f64 };
 
-    let gd = StochasticGD::new(0.9f64, 0.1f64, 100);
+    let gd = StochasticGD::new(0.5f64, 1f64, 100);
     let test_data = vec![100f64];
     let params = gd.optimize(&x_sq,
                              &test_data[..],

From f658b1a700f2ab2e2465fe3e7a9c8d5f5fbb5472 Mon Sep 17 00:00:00 2001
From: cfeitong
Date: Sun, 21 Jan 2018 13:30:37 +0800
Subject: [PATCH 4/6] make Nesterov momentum for SGD optional

---
 src/learning/optim/grad_desc.rs | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/learning/optim/grad_desc.rs b/src/learning/optim/grad_desc.rs
index b951b628..1a6bbb58 100644
--- a/src/learning/optim/grad_desc.rs
+++ b/src/learning/optim/grad_desc.rs
@@ -209,11 +209,15 @@ impl<M> OptimAlgorithm<M> for StochasticGD
                 // Update the parameters
                 optimizing_val = &optimizing_val -
                                  (&prev_w * (-self.alpha) + &delta_w * (1. + self.alpha));
-                // Set the end cost (this is only used after the last iteration)
-                end_cost += cost;
             } else {
-
+                // Compute the difference in gradient using momentum
+                delta_w = Vector::new(vec_data) * self.mu + &delta_w * self.alpha;
+                // Update the parameters
+                optimizing_val = &optimizing_val - &delta_w * self.mu;
             }
+
+            // Set the end cost (this is only used after the last iteration)
+            end_cost += cost;
         }
 
         end_cost /= inputs.rows() as f64;
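With patches 2 and 4 applied together, the Nesterov update is strictly opt-in and the default behaviour falls through to the classic momentum branch. A short usage sketch, assuming the patched crate is on the path (the constructor arguments are the same `alpha`, `mu`, `iters` triple used in the doc examples above):

```rust
use rusty_machine::learning::optim::grad_desc::StochasticGD;

fn main() {
    // Default: plain momentum (the `else` branch above).
    let _sgd = StochasticGD::new(0.1, 0.3, 5);
    // Opt in to the Nesterov update introduced in patch 2.
    let _nesterov_sgd = StochasticGD::new(0.1, 0.3, 5).with_nesterov_momentum();
}
```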
From fbe169b0c768531ccd9ccb1016023991e94971e5 Mon Sep 17 00:00:00 2001
From: cfeitong
Date: Sun, 21 Jan 2018 13:32:11 +0800
Subject: [PATCH 5/6] add test for Nesterov momentum

---
 tests/learning/optim/grad_desc.rs | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tests/learning/optim/grad_desc.rs b/tests/learning/optim/grad_desc.rs
index 9dd1281a..ad9d6abd 100644
--- a/tests/learning/optim/grad_desc.rs
+++ b/tests/learning/optim/grad_desc.rs
@@ -69,6 +69,21 @@ fn convex_stochastic_gd_training() {
     assert!(x_sq.compute_grad(&params, &Matrix::zeros(1, 1), &Matrix::zeros(1, 1)).0 < 1e-10);
 }
 
+#[test]
+fn convex_stochastic_gd_nesterov_momentum_training() {
+    let x_sq = XSqModel { c: 20f64 };
+
+    let gd = StochasticGD::new(0.9f64, 0.1f64, 100).with_nesterov_momentum();
+    let test_data = vec![100f64];
+    let params = gd.optimize(&x_sq,
+                             &test_data[..],
+                             &Matrix::zeros(100, 1),
+                             &Matrix::zeros(100, 1));
+
+    assert!((params[0] - 20f64).abs() < 1e-10);
+    assert!(x_sq.compute_grad(&params, &Matrix::zeros(1, 1), &Matrix::zeros(1, 1)).0 < 1e-10);
+}
+
 #[test]
 fn convex_adagrad_training() {
     let x_sq = XSqModel { c: 20f64 };

From ff681877d6e6b2b904b16c23f4b2190442fd765c Mon Sep 17 00:00:00 2001
From: cfeitong
Date: Sun, 21 Jan 2018 13:40:41 +0800
Subject: [PATCH 6/6] revert Cargo.toml

---
 Cargo.toml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index f0a46151..5502be8f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,6 +17,5 @@ datasets = []
 
 [dependencies]
 num = { version = "0.1.41", default-features = false }
-rand = "0.4"
-# rulinalg = { git = "https://github.com/AtheMathmo/rulinalg", rev = "1ed8b937" }
-rulinalg = { path = "../rulinalg" }
+rand = "0.4.1"
+rulinalg = { git = "https://github.com/AtheMathmo/rulinalg", rev = "1ed8b937" }
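For reference, the new test in patch 5 minimizes `f(x) = (x - c)^2` with `c = 20`, whose gradient is `2(x - c)`; `XSqModel` lives in the existing test file and is not shown in these patches. Below is a self-contained scalar sketch of the convergence the test asserts, assuming (as the `Matrix::zeros(100, 1)` inputs and the `end_cost /= inputs.rows()` line suggest) that each of the 100 passes performs one stochastic update per input row:

```rust
// Scalar re-enactment of convex_stochastic_gd_nesterov_momentum_training.
// `grad` is a hypothetical stand-in for XSqModel::compute_grad.
fn grad(x: f64, c: f64) -> f64 {
    2. * (x - c)
}

fn main() {
    let (alpha, mu, iters, rows) = (0.9, 0.1, 100, 100); // the test's hyperparameters
    let c = 20.;
    let mut x = 100.; // the test starts the parameter at 100.0
    let mut v = 0.;
    for _ in 0..(iters * rows) {
        let prev_v = v;
        // Velocity and parameter updates as in the Nesterov branch of patch 2.
        v = mu * grad(x, c) + alpha * v;
        x -= -alpha * prev_v + (1. + alpha) * v;
    }
    assert!((x - c).abs() < 1e-10);
    println!("converged to {}", x);
}
```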