diff --git a/python_clustering/data/heart.csv b/python_clustering/data/heart.csv new file mode 100644 index 000000000..0966e67b5 --- /dev/null +++ b/python_clustering/data/heart.csv @@ -0,0 +1,304 @@ +age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output +63,1,3,145,233,1,0,150,0,2.3,0,0,1,1 +37,1,2,130,250,0,1,187,0,3.5,0,0,2,1 +41,0,1,130,204,0,0,172,0,1.4,2,0,2,1 +56,1,1,120,236,0,1,178,0,0.8,2,0,2,1 +57,0,0,120,354,0,1,163,1,0.6,2,0,2,1 +57,1,0,140,192,0,1,148,0,0.4,1,0,1,1 +56,0,1,140,294,0,0,153,0,1.3,1,0,2,1 +44,1,1,120,263,0,1,173,0,0,2,0,3,1 +52,1,2,172,199,1,1,162,0,0.5,2,0,3,1 +57,1,2,150,168,0,1,174,0,1.6,2,0,2,1 +54,1,0,140,239,0,1,160,0,1.2,2,0,2,1 +48,0,2,130,275,0,1,139,0,0.2,2,0,2,1 +49,1,1,130,266,0,1,171,0,0.6,2,0,2,1 +64,1,3,110,211,0,0,144,1,1.8,1,0,2,1 +58,0,3,150,283,1,0,162,0,1,2,0,2,1 +50,0,2,120,219,0,1,158,0,1.6,1,0,2,1 +58,0,2,120,340,0,1,172,0,0,2,0,2,1 +66,0,3,150,226,0,1,114,0,2.6,0,0,2,1 +43,1,0,150,247,0,1,171,0,1.5,2,0,2,1 +69,0,3,140,239,0,1,151,0,1.8,2,2,2,1 +59,1,0,135,234,0,1,161,0,0.5,1,0,3,1 +44,1,2,130,233,0,1,179,1,0.4,2,0,2,1 +42,1,0,140,226,0,1,178,0,0,2,0,2,1 +61,1,2,150,243,1,1,137,1,1,1,0,2,1 +40,1,3,140,199,0,1,178,1,1.4,2,0,3,1 +71,0,1,160,302,0,1,162,0,0.4,2,2,2,1 +59,1,2,150,212,1,1,157,0,1.6,2,0,2,1 +51,1,2,110,175,0,1,123,0,0.6,2,0,2,1 +65,0,2,140,417,1,0,157,0,0.8,2,1,2,1 +53,1,2,130,197,1,0,152,0,1.2,0,0,2,1 +41,0,1,105,198,0,1,168,0,0,2,1,2,1 +65,1,0,120,177,0,1,140,0,0.4,2,0,3,1 +44,1,1,130,219,0,0,188,0,0,2,0,2,1 +54,1,2,125,273,0,0,152,0,0.5,0,1,2,1 +51,1,3,125,213,0,0,125,1,1.4,2,1,2,1 +46,0,2,142,177,0,0,160,1,1.4,0,0,2,1 +54,0,2,135,304,1,1,170,0,0,2,0,2,1 +54,1,2,150,232,0,0,165,0,1.6,2,0,3,1 +65,0,2,155,269,0,1,148,0,0.8,2,0,2,1 +65,0,2,160,360,0,0,151,0,0.8,2,0,2,1 +51,0,2,140,308,0,0,142,0,1.5,2,1,2,1 +48,1,1,130,245,0,0,180,0,0.2,1,0,2,1 +45,1,0,104,208,0,0,148,1,3,1,0,2,1 +53,0,0,130,264,0,0,143,0,0.4,1,0,2,1 +39,1,2,140,321,0,0,182,0,0,2,0,2,1 +52,1,1,120,325,0,1,172,0,0.2,2,0,2,1 +44,1,2,140,235,0,0,180,0,0,2,0,2,1 +47,1,2,138,257,0,0,156,0,0,2,0,2,1 +53,0,2,128,216,0,0,115,0,0,2,0,0,1 +53,0,0,138,234,0,0,160,0,0,2,0,2,1 +51,0,2,130,256,0,0,149,0,0.5,2,0,2,1 +66,1,0,120,302,0,0,151,0,0.4,1,0,2,1 +62,1,2,130,231,0,1,146,0,1.8,1,3,3,1 +44,0,2,108,141,0,1,175,0,0.6,1,0,2,1 +63,0,2,135,252,0,0,172,0,0,2,0,2,1 +52,1,1,134,201,0,1,158,0,0.8,2,1,2,1 +48,1,0,122,222,0,0,186,0,0,2,0,2,1 +45,1,0,115,260,0,0,185,0,0,2,0,2,1 +34,1,3,118,182,0,0,174,0,0,2,0,2,1 +57,0,0,128,303,0,0,159,0,0,2,1,2,1 +71,0,2,110,265,1,0,130,0,0,2,1,2,1 +54,1,1,108,309,0,1,156,0,0,2,0,3,1 +52,1,3,118,186,0,0,190,0,0,1,0,1,1 +41,1,1,135,203,0,1,132,0,0,1,0,1,1 +58,1,2,140,211,1,0,165,0,0,2,0,2,1 +35,0,0,138,183,0,1,182,0,1.4,2,0,2,1 +51,1,2,100,222,0,1,143,1,1.2,1,0,2,1 +45,0,1,130,234,0,0,175,0,0.6,1,0,2,1 +44,1,1,120,220,0,1,170,0,0,2,0,2,1 +62,0,0,124,209,0,1,163,0,0,2,0,2,1 +54,1,2,120,258,0,0,147,0,0.4,1,0,3,1 +51,1,2,94,227,0,1,154,1,0,2,1,3,1 +29,1,1,130,204,0,0,202,0,0,2,0,2,1 +51,1,0,140,261,0,0,186,1,0,2,0,2,1 +43,0,2,122,213,0,1,165,0,0.2,1,0,2,1 +55,0,1,135,250,0,0,161,0,1.4,1,0,2,1 +51,1,2,125,245,1,0,166,0,2.4,1,0,2,1 +59,1,1,140,221,0,1,164,1,0,2,0,2,1 +52,1,1,128,205,1,1,184,0,0,2,0,2,1 +58,1,2,105,240,0,0,154,1,0.6,1,0,3,1 +41,1,2,112,250,0,1,179,0,0,2,0,2,1 +45,1,1,128,308,0,0,170,0,0,2,0,2,1 +60,0,2,102,318,0,1,160,0,0,2,1,2,1 +52,1,3,152,298,1,1,178,0,1.2,1,0,3,1 +42,0,0,102,265,0,0,122,0,0.6,1,0,2,1 +67,0,2,115,564,0,0,160,0,1.6,1,0,3,1 +68,1,2,118,277,0,1,151,0,1,2,1,3,1 +46,1,1,101,197,1,1,156,0,0,2,0,3,1 
+54,0,2,110,214,0,1,158,0,1.6,1,0,2,1 +58,0,0,100,248,0,0,122,0,1,1,0,2,1 +48,1,2,124,255,1,1,175,0,0,2,2,2,1 +57,1,0,132,207,0,1,168,1,0,2,0,3,1 +52,1,2,138,223,0,1,169,0,0,2,4,2,1 +54,0,1,132,288,1,0,159,1,0,2,1,2,1 +45,0,1,112,160,0,1,138,0,0,1,0,2,1 +53,1,0,142,226,0,0,111,1,0,2,0,3,1 +62,0,0,140,394,0,0,157,0,1.2,1,0,2,1 +52,1,0,108,233,1,1,147,0,0.1,2,3,3,1 +43,1,2,130,315,0,1,162,0,1.9,2,1,2,1 +53,1,2,130,246,1,0,173,0,0,2,3,2,1 +42,1,3,148,244,0,0,178,0,0.8,2,2,2,1 +59,1,3,178,270,0,0,145,0,4.2,0,0,3,1 +63,0,1,140,195,0,1,179,0,0,2,2,2,1 +42,1,2,120,240,1,1,194,0,0.8,0,0,3,1 +50,1,2,129,196,0,1,163,0,0,2,0,2,1 +68,0,2,120,211,0,0,115,0,1.5,1,0,2,1 +69,1,3,160,234,1,0,131,0,0.1,1,1,2,1 +45,0,0,138,236,0,0,152,1,0.2,1,0,2,1 +50,0,1,120,244,0,1,162,0,1.1,2,0,2,1 +50,0,0,110,254,0,0,159,0,0,2,0,2,1 +64,0,0,180,325,0,1,154,1,0,2,0,2,1 +57,1,2,150,126,1,1,173,0,0.2,2,1,3,1 +64,0,2,140,313,0,1,133,0,0.2,2,0,3,1 +43,1,0,110,211,0,1,161,0,0,2,0,3,1 +55,1,1,130,262,0,1,155,0,0,2,0,2,1 +37,0,2,120,215,0,1,170,0,0,2,0,2,1 +41,1,2,130,214,0,0,168,0,2,1,0,2,1 +56,1,3,120,193,0,0,162,0,1.9,1,0,3,1 +46,0,1,105,204,0,1,172,0,0,2,0,2,1 +46,0,0,138,243,0,0,152,1,0,1,0,2,1 +64,0,0,130,303,0,1,122,0,2,1,2,2,1 +59,1,0,138,271,0,0,182,0,0,2,0,2,1 +41,0,2,112,268,0,0,172,1,0,2,0,2,1 +54,0,2,108,267,0,0,167,0,0,2,0,2,1 +39,0,2,94,199,0,1,179,0,0,2,0,2,1 +34,0,1,118,210,0,1,192,0,0.7,2,0,2,1 +47,1,0,112,204,0,1,143,0,0.1,2,0,2,1 +67,0,2,152,277,0,1,172,0,0,2,1,2,1 +52,0,2,136,196,0,0,169,0,0.1,1,0,2,1 +74,0,1,120,269,0,0,121,1,0.2,2,1,2,1 +54,0,2,160,201,0,1,163,0,0,2,1,2,1 +49,0,1,134,271,0,1,162,0,0,1,0,2,1 +42,1,1,120,295,0,1,162,0,0,2,0,2,1 +41,1,1,110,235,0,1,153,0,0,2,0,2,1 +41,0,1,126,306,0,1,163,0,0,2,0,2,1 +49,0,0,130,269,0,1,163,0,0,2,0,2,1 +60,0,2,120,178,1,1,96,0,0,2,0,2,1 +62,1,1,128,208,1,0,140,0,0,2,0,2,1 +57,1,0,110,201,0,1,126,1,1.5,1,0,1,1 +64,1,0,128,263,0,1,105,1,0.2,1,1,3,1 +51,0,2,120,295,0,0,157,0,0.6,2,0,2,1 +43,1,0,115,303,0,1,181,0,1.2,1,0,2,1 +42,0,2,120,209,0,1,173,0,0,1,0,2,1 +67,0,0,106,223,0,1,142,0,0.3,2,2,2,1 +76,0,2,140,197,0,2,116,0,1.1,1,0,2,1 +70,1,1,156,245,0,0,143,0,0,2,0,2,1 +44,0,2,118,242,0,1,149,0,0.3,1,1,2,1 +60,0,3,150,240,0,1,171,0,0.9,2,0,2,1 +44,1,2,120,226,0,1,169,0,0,2,0,2,1 +42,1,2,130,180,0,1,150,0,0,2,0,2,1 +66,1,0,160,228,0,0,138,0,2.3,2,0,1,1 +71,0,0,112,149,0,1,125,0,1.6,1,0,2,1 +64,1,3,170,227,0,0,155,0,0.6,1,0,3,1 +66,0,2,146,278,0,0,152,0,0,1,1,2,1 +39,0,2,138,220,0,1,152,0,0,1,0,2,1 +58,0,0,130,197,0,1,131,0,0.6,1,0,2,1 +47,1,2,130,253,0,1,179,0,0,2,0,2,1 +35,1,1,122,192,0,1,174,0,0,2,0,2,1 +58,1,1,125,220,0,1,144,0,0.4,1,4,3,1 +56,1,1,130,221,0,0,163,0,0,2,0,3,1 +56,1,1,120,240,0,1,169,0,0,0,0,2,1 +55,0,1,132,342,0,1,166,0,1.2,2,0,2,1 +41,1,1,120,157,0,1,182,0,0,2,0,2,1 +38,1,2,138,175,0,1,173,0,0,2,4,2,1 +38,1,2,138,175,0,1,173,0,0,2,4,2,1 +67,1,0,160,286,0,0,108,1,1.5,1,3,2,0 +67,1,0,120,229,0,0,129,1,2.6,1,2,3,0 +62,0,0,140,268,0,0,160,0,3.6,0,2,2,0 +63,1,0,130,254,0,0,147,0,1.4,1,1,3,0 +53,1,0,140,203,1,0,155,1,3.1,0,0,3,0 +56,1,2,130,256,1,0,142,1,0.6,1,1,1,0 +48,1,1,110,229,0,1,168,0,1,0,0,3,0 +58,1,1,120,284,0,0,160,0,1.8,1,0,2,0 +58,1,2,132,224,0,0,173,0,3.2,2,2,3,0 +60,1,0,130,206,0,0,132,1,2.4,1,2,3,0 +40,1,0,110,167,0,0,114,1,2,1,0,3,0 +60,1,0,117,230,1,1,160,1,1.4,2,2,3,0 +64,1,2,140,335,0,1,158,0,0,2,0,2,0 +43,1,0,120,177,0,0,120,1,2.5,1,0,3,0 +57,1,0,150,276,0,0,112,1,0.6,1,1,1,0 +55,1,0,132,353,0,1,132,1,1.2,1,1,3,0 +65,0,0,150,225,0,0,114,0,1,1,3,3,0 +61,0,0,130,330,0,0,169,0,0,2,0,2,0 +58,1,2,112,230,0,0,165,0,2.5,1,1,3,0 
+50,1,0,150,243,0,0,128,0,2.6,1,0,3,0 +44,1,0,112,290,0,0,153,0,0,2,1,2,0 +60,1,0,130,253,0,1,144,1,1.4,2,1,3,0 +54,1,0,124,266,0,0,109,1,2.2,1,1,3,0 +50,1,2,140,233,0,1,163,0,0.6,1,1,3,0 +41,1,0,110,172,0,0,158,0,0,2,0,3,0 +51,0,0,130,305,0,1,142,1,1.2,1,0,3,0 +58,1,0,128,216,0,0,131,1,2.2,1,3,3,0 +54,1,0,120,188,0,1,113,0,1.4,1,1,3,0 +60,1,0,145,282,0,0,142,1,2.8,1,2,3,0 +60,1,2,140,185,0,0,155,0,3,1,0,2,0 +59,1,0,170,326,0,0,140,1,3.4,0,0,3,0 +46,1,2,150,231,0,1,147,0,3.6,1,0,2,0 +67,1,0,125,254,1,1,163,0,0.2,1,2,3,0 +62,1,0,120,267,0,1,99,1,1.8,1,2,3,0 +65,1,0,110,248,0,0,158,0,0.6,2,2,1,0 +44,1,0,110,197,0,0,177,0,0,2,1,2,0 +60,1,0,125,258,0,0,141,1,2.8,1,1,3,0 +58,1,0,150,270,0,0,111,1,0.8,2,0,3,0 +68,1,2,180,274,1,0,150,1,1.6,1,0,3,0 +62,0,0,160,164,0,0,145,0,6.2,0,3,3,0 +52,1,0,128,255,0,1,161,1,0,2,1,3,0 +59,1,0,110,239,0,0,142,1,1.2,1,1,3,0 +60,0,0,150,258,0,0,157,0,2.6,1,2,3,0 +49,1,2,120,188,0,1,139,0,2,1,3,3,0 +59,1,0,140,177,0,1,162,1,0,2,1,3,0 +57,1,2,128,229,0,0,150,0,0.4,1,1,3,0 +61,1,0,120,260,0,1,140,1,3.6,1,1,3,0 +39,1,0,118,219,0,1,140,0,1.2,1,0,3,0 +61,0,0,145,307,0,0,146,1,1,1,0,3,0 +56,1,0,125,249,1,0,144,1,1.2,1,1,2,0 +43,0,0,132,341,1,0,136,1,3,1,0,3,0 +62,0,2,130,263,0,1,97,0,1.2,1,1,3,0 +63,1,0,130,330,1,0,132,1,1.8,2,3,3,0 +65,1,0,135,254,0,0,127,0,2.8,1,1,3,0 +48,1,0,130,256,1,0,150,1,0,2,2,3,0 +63,0,0,150,407,0,0,154,0,4,1,3,3,0 +55,1,0,140,217,0,1,111,1,5.6,0,0,3,0 +65,1,3,138,282,1,0,174,0,1.4,1,1,2,0 +56,0,0,200,288,1,0,133,1,4,0,2,3,0 +54,1,0,110,239,0,1,126,1,2.8,1,1,3,0 +70,1,0,145,174,0,1,125,1,2.6,0,0,3,0 +62,1,1,120,281,0,0,103,0,1.4,1,1,3,0 +35,1,0,120,198,0,1,130,1,1.6,1,0,3,0 +59,1,3,170,288,0,0,159,0,0.2,1,0,3,0 +64,1,2,125,309,0,1,131,1,1.8,1,0,3,0 +47,1,2,108,243,0,1,152,0,0,2,0,2,0 +57,1,0,165,289,1,0,124,0,1,1,3,3,0 +55,1,0,160,289,0,0,145,1,0.8,1,1,3,0 +64,1,0,120,246,0,0,96,1,2.2,0,1,2,0 +70,1,0,130,322,0,0,109,0,2.4,1,3,2,0 +51,1,0,140,299,0,1,173,1,1.6,2,0,3,0 +58,1,0,125,300,0,0,171,0,0,2,2,3,0 +60,1,0,140,293,0,0,170,0,1.2,1,2,3,0 +77,1,0,125,304,0,0,162,1,0,2,3,2,0 +35,1,0,126,282,0,0,156,1,0,2,0,3,0 +70,1,2,160,269,0,1,112,1,2.9,1,1,3,0 +59,0,0,174,249,0,1,143,1,0,1,0,2,0 +64,1,0,145,212,0,0,132,0,2,1,2,1,0 +57,1,0,152,274,0,1,88,1,1.2,1,1,3,0 +56,1,0,132,184,0,0,105,1,2.1,1,1,1,0 +48,1,0,124,274,0,0,166,0,0.5,1,0,3,0 +56,0,0,134,409,0,0,150,1,1.9,1,2,3,0 +66,1,1,160,246,0,1,120,1,0,1,3,1,0 +54,1,1,192,283,0,0,195,0,0,2,1,3,0 +69,1,2,140,254,0,0,146,0,2,1,3,3,0 +51,1,0,140,298,0,1,122,1,4.2,1,3,3,0 +43,1,0,132,247,1,0,143,1,0.1,1,4,3,0 +62,0,0,138,294,1,1,106,0,1.9,1,3,2,0 +67,1,0,100,299,0,0,125,1,0.9,1,2,2,0 +59,1,3,160,273,0,0,125,0,0,2,0,2,0 +45,1,0,142,309,0,0,147,1,0,1,3,3,0 +58,1,0,128,259,0,0,130,1,3,1,2,3,0 +50,1,0,144,200,0,0,126,1,0.9,1,0,3,0 +62,0,0,150,244,0,1,154,1,1.4,1,0,2,0 +38,1,3,120,231,0,1,182,1,3.8,1,0,3,0 +66,0,0,178,228,1,1,165,1,1,1,2,3,0 +52,1,0,112,230,0,1,160,0,0,2,1,2,0 +53,1,0,123,282,0,1,95,1,2,1,2,3,0 +63,0,0,108,269,0,1,169,1,1.8,1,2,2,0 +54,1,0,110,206,0,0,108,1,0,1,1,2,0 +66,1,0,112,212,0,0,132,1,0.1,2,1,2,0 +55,0,0,180,327,0,2,117,1,3.4,1,0,2,0 +49,1,2,118,149,0,0,126,0,0.8,2,3,2,0 +54,1,0,122,286,0,0,116,1,3.2,1,2,2,0 +56,1,0,130,283,1,0,103,1,1.6,0,0,3,0 +46,1,0,120,249,0,0,144,0,0.8,2,0,3,0 +61,1,3,134,234,0,1,145,0,2.6,1,2,2,0 +67,1,0,120,237,0,1,71,0,1,1,0,2,0 +58,1,0,100,234,0,1,156,0,0.1,2,1,3,0 +47,1,0,110,275,0,0,118,1,1,1,1,2,0 +52,1,0,125,212,0,1,168,0,1,2,2,3,0 +58,1,0,146,218,0,1,105,0,2,1,1,3,0 +57,1,1,124,261,0,1,141,0,0.3,2,0,3,0 +58,0,1,136,319,1,0,152,0,0,2,2,2,0 
+61,1,0,138,166,0,0,125,1,3.6,1,1,2,0
+42,1,0,136,315,0,1,125,1,1.8,1,0,1,0
+52,1,0,128,204,1,1,156,1,1,1,0,0,0
+59,1,2,126,218,1,1,134,0,2.2,1,1,1,0
+40,1,0,152,223,0,1,181,0,0,2,0,3,0
+61,1,0,140,207,0,0,138,1,1.9,2,1,3,0
+46,1,0,140,311,0,1,120,1,1.8,1,2,3,0
+59,1,3,134,204,0,1,162,0,0.8,2,2,2,0
+57,1,1,154,232,0,0,164,0,0,2,1,2,0
+57,1,0,110,335,0,1,143,1,3,1,1,3,0
+55,0,0,128,205,0,2,130,1,2,1,1,3,0
+61,1,0,148,203,0,1,161,0,0,2,1,3,0
+58,1,0,114,318,0,2,140,0,4.4,0,3,1,0
+58,0,0,170,225,1,0,146,1,2.8,1,2,1,0
+67,1,2,152,212,0,0,150,0,0.8,1,0,3,0
+44,1,0,120,169,0,1,144,1,2.8,0,0,1,0
+63,1,0,140,187,0,0,144,1,4,2,2,3,0
+63,0,0,124,197,0,1,136,1,0,1,0,2,0
+59,1,0,164,176,1,0,90,0,1,1,2,1,0
+57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
+45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
+68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
+57,1,0,130,131,0,1,115,1,1.2,1,1,3,0
+57,0,1,130,236,0,0,174,0,0,1,1,2,0
diff --git a/python_clustering/data/polyps.csv b/python_clustering/data/polyps.csv
new file mode 100644
index 000000000..54f327919
--- /dev/null
+++ b/python_clustering/data/polyps.csv
@@ -0,0 +1,23 @@
+"","participant_id","sex","age","baseline","treatment","number3m","number12m"
+"1","001","female",17,7,"sulindac",6,NA
+"2","002","female",20,77,"placebo",67,63
+"3","003","male",16,7,"sulindac",4,2
+"4","004","female",18,5,"placebo",5,28
+"5","005","male",22,23,"sulindac",16,17
+"6","006","female",13,35,"placebo",31,61
+"7","007","female",23,11,"sulindac",6,1
+"8","008","male",34,12,"placebo",20,7
+"9","009","male",50,7,"placebo",7,15
+"10","010","male",19,318,"placebo",347,44
+"11","011","male",17,160,"sulindac",142,25
+"12","012","female",23,8,"sulindac",1,3
+"13","013","male",22,20,"placebo",16,28
+"14","014","male",30,11,"placebo",20,10
+"15","015","male",27,24,"placebo",26,40
+"16","016","male",23,34,"sulindac",27,33
+"17","017","female",22,54,"placebo",45,46
+"18","018","male",13,16,"sulindac",10,NA
+"19","019","male",34,30,"placebo",30,50
+"20","020","female",23,10,"sulindac",6,3
+"21","021","female",22,20,"sulindac",5,1
+"22","022","male",42,12,"sulindac",8,4
diff --git a/python_clustering/python_clustering.md b/python_clustering/python_clustering.md
new file mode 100644
index 000000000..5c9fd0aa6
--- /dev/null
+++ b/python_clustering/python_clustering.md
@@ -0,0 +1,477 @@
+
+
+# Clustering in Python
+
+@overview
+
+
+
+## Review of Clustering
+
+**Clustering** is a machine learning technique used to group unlabeled data points into clusters based on their similarity. The goal is to identify groups of data points that are similar to each other and dissimilar to data points in other groups. In this lesson, we will work through an example of K-Means clustering. Other common algorithms include hierarchical clustering and Gaussian Mixture Models.
+
+For a more in-depth look at what clustering is, see the [_other clustering module_](link).
+
+Clustering is a type of **unsupervised learning**. Unsupervised learning algorithms are algorithms trained on unlabeled data to identify patterns and relationships without prior knowledge. This is different from supervised learning, where an algorithm is initially trained on labeled data in order to predict labels for new data points.
+
+- **Applications:** Clustering finds applications in various fields such as customer segmentation, biomedical research, drug development, gene expression analysis, medical image analysis, and disease-risk prediction.
+
+- **Understanding Techniques:** Techniques like normalization, computing distances from cluster centroids, and visualization aid in building accurate clustering models and interpreting results.
+
+- **Challenges and Limitations:** Challenges include sensitivity to initialization, difficulty in choosing the number of clusters, handling outliers, and interpreting results in high-dimensional data.
+
+- **Mitigating Sensitivity:** Techniques like running the algorithm multiple times with different initializations, using robust algorithms, and preprocessing data help mitigate sensitivity to initialization.
+
+- **Conclusion:** Clustering is a powerful tool with diverse applications, but it's essential to understand its limitations and challenges. With foundational knowledge in clustering techniques, one can explore advanced methods and make informed decisions in data analysis and machine learning endeavors.
+
+
+
+## The K-Means Clustering Algorithm
+
+The **K-Means Clustering Algorithm**, sometimes referred to as simply "K-Means," works by iteratively assigning data points to clusters based on their distance to cluster centroids.
+
+The key steps of K-Means clustering are:
+1. choosing the number of clusters (K),
+2. initializing centroids,
+3. assigning each data point to its nearest centroid,
+4. recalculating each centroid as the mean of its assigned points, and
+5. repeating steps 3 and 4 until the assignments stop changing (convergence).
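+
+To make these steps concrete, here is a minimal NumPy sketch of a single assignment-and-update iteration. This is illustrative only: the rest of this lesson uses Scikit-learn's `KMeans`, which handles initialization, iteration, and convergence for us, and the toy arrays (`points`, `centroids`) are invented for this example.
+
+```python
+import numpy as np
+
+# Toy data: six 2D points, and K = 2 centroids (initialized arbitrarily here).
+points = np.array([[1.0, 1.0], [1.2, 0.8], [0.9, 1.1],
+                   [5.0, 5.0], [5.2, 4.8], [4.9, 5.1]])
+centroids = np.array([[0.0, 0.0], [6.0, 6.0]])
+
+# Assignment step: label each point with the index of its nearest centroid.
+distances = np.linalg.norm(points[:, None, :] - centroids[None, :, :], axis=2)
+labels = distances.argmin(axis=1)
+
+# Update step: move each centroid to the mean of the points assigned to it.
+centroids = np.array([points[labels == k].mean(axis=0) for k in range(2)])
+
+print(labels)     # e.g. [0 0 0 1 1 1]
+print(centroids)  # the centroids have moved toward the two point clouds
+```
+
+A real implementation repeats these two steps until the labels stop changing.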
+
+### Clustering Patients
+
+We are going to use a heart disease dataset ***(add more details about its origin)***. Each row represents one patient, and each column records a clinical attribute:
+
+- `age`: age in years
+- `sex`: sex
+- `cp`: chest pain type
+- `trtbps`: resting blood pressure
+- `chol`: serum cholesterol level
+- `fbs`: fasting blood sugar level
+- `restecg`: resting electrocardiographic results
+- `thalachh`: maximum heart rate achieved
+- `exng`: exercise-induced angina
+- `oldpeak`: ST depression induced by exercise relative to rest
+- `slp`: slope of the peak exercise ST segment
+- `caa`: number of major vessels colored by fluoroscopy
+- `thall`: thalassemia type
+- `output`: presence of heart disease
+
+The data relates to the diagnosis of heart disease, with the `output` variable indicating whether a patient has heart disease (1) or not (0).
+
+To implement k-means clustering in Python using Scikit-learn, we can follow these steps:
+
+### 1. Import Libraries
+
+**Description:**
+This step imports essential libraries needed for data manipulation, analysis, and visualization, as well as the KMeans clustering algorithm.
+
+* **numpy (np):** This library provides tools for numerical operations and working with arrays, which are essential for data manipulation in machine learning.
+* **pandas (pd):** Pandas is used for data analysis and manipulation, especially with tabular data. It makes it easy to load, clean, and organize your data.
+* **matplotlib.pyplot (plt):** Matplotlib is a powerful plotting library for creating graphs and visualizations. We'll use it to visualize our data and clustering results.
+* **sklearn.model_selection (train_test_split):** We'll use this function later if we need to split our data into training and testing sets for model evaluation.
+* **sklearn.cluster (KMeans):** This is where the heart of our clustering algorithm lies. KMeans is the specific algorithm we'll use to group our data into clusters.
+* **scipy.spatial (distance):** Scipy is a broader scientific computing library. The distance module provides functions to calculate distances between points, which we'll use in our KMeans analysis.
+
+**Why it's important:**
+These libraries provide the foundational tools and functions required to perform data preprocessing, clustering, and visualization. Without them, we wouldn't be able to efficiently handle the data or perform the clustering analysis.
+
+```python
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.cluster import KMeans
+from scipy.spatial import distance
+```
+@Pyodide.eval
+
+**Output:**
+There is no direct output for this step, as it is focused on importing necessary libraries. However, successful execution without errors indicates that the libraries are correctly imported and ready for use.
+
+
+
+### 2. Loading the Data
+
+**Description:**
+This step involves loading the patient data from a CSV file into a Pandas DataFrame and then examining the structure of the data.
+
+* `data = pd.read_csv(file)`: This line reads the CSV (Comma-Separated Values) file containing the patient data into a Pandas DataFrame called `data`. DataFrames are like tables, where each row represents a patient, and each column represents a feature (e.g., age, cholesterol).
+* `data.info()`: This function gives you a summary of the DataFrame, showing the column names, their data types, and how many non-null values are in each column. This helps you understand the structure of your data.
+
+**Why it's important:**
+Understanding the structure of your data is crucial before performing any data manipulation or analysis. It helps identify any missing values, understand data types, and get a general overview of the dataset.
+
+
+```python @Pyodide.exec
+
+import pandas as pd
+import io
+from pyodide.http import open_url
+
+url = "https://raw.githubusercontent.com/arcus/education_modules/python_clustering/python_clustering/data/heart.csv"
+
+url_contents = open_url(url)
+text = url_contents.read()
+file = io.StringIO(text)
+
+data = pd.read_csv(file)
+
+
+# Analyze data and features
+data.info()
+
+# Preview the first few rows
+print(data.head())
+```
+
+
+**Output:**
+
+`data.info()` gives a summary of the DataFrame, including the number of non-null entries for each column and their data types.
+`print(data.head())` displays the first few rows of the DataFrame to give learners a feel for what the data looks like.
+
+
+
+### 3. Visualize Data
+**Description:** This code generates a scatter plot with `chol` (Cholesterol) on the x-axis and `trtbps` (Resting Blood Pressure) on the y-axis. The data points are colored based on the `output` column, using the `viridis` colormap. Labels and a title are added, and then the plot is displayed.
+
+
+**Why it's important:**
+Visualizing the data before clustering helps you spot patterns, outliers, and potential groupings. Plotting two clinically meaningful features, colored by the known diagnosis, also gives you a baseline picture to compare the clustering results against later.
+
+```python
+# Create the scatter plot
+data.plot.scatter(x='chol', y='trtbps', c='output', colormap='viridis')
+plt.xlabel("Cholesterol")
+plt.ylabel("Resting Blood Pressure")
+plt.title("Scatter Plot of Cholesterol vs. Blood Pressure")
+plt.show()
+```
+@Pyodide.eval
+
+
+**Output:**
+A scatter plot of cholesterol against resting blood pressure, with each point colored by the `output` (diagnosis) value. This gives a first look at how the two diagnostic groups are distributed across these two features.
+
+
+### 4. Normalize DataFrame
+
+**Description:**
+
+* The function `normalize(df, features)` is defined to perform min-max normalization of the features listed in `features` within the DataFrame `df`. It creates a copy `result` of the DataFrame and iterates over each feature to scale its values to the range [0, 1]. The normalized DataFrame `result` is returned.
+* The `normalize` function is then applied to the `data` DataFrame to normalize all columns, and the results are stored in `normalized_data`.
+
+**Why it's important:**
+Normalization is crucial because it scales the data to a common range without distorting differences in the ranges of values. This ensures that no single feature dominates the clustering algorithm due to its scale, leading to more meaningful and comparable results.
+
+
+```python
+# Normalize dataframe
+def normalize(df, features):
+    # Create a copy of the DataFrame to avoid modifying the original data.
+    result = df.copy()
+
+    # Iterate through each feature specified for normalization.
+    for feature_name in features:
+        # Find the maximum and minimum values of the current feature.
+        max_value = df[feature_name].max()
+        min_value = df[feature_name].min()
+
+        # Normalize the current feature using the min-max scaling formula.
+        # This ensures that all values of the feature are scaled between 0 and 1.
+        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
+    return result
+
+# Call the normalize function with the entire DataFrame 'data' and all its columns.
+# Store the result in 'normalized_data'.
+normalized_data = normalize(data, data.columns)
+
+# Print the normalized data to see the transformed values
+print(normalized_data)
+```
+@Pyodide.eval
+
+**Output:**
+This code performs min-max normalization on the dataset and prints the resulting `normalized_data`. The output will show the scaled values of each feature, ensuring that all values are between 0 and 1. This step is critical for ensuring that the clustering algorithm treats each feature equally.
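+
+As a side note, scikit-learn ships the same min-max scaling as a reusable transformer, `sklearn.preprocessing.MinMaxScaler`. This optional sketch is an alternative to the hand-written `normalize` function above (the name `normalized_data_alt` is made up for this example); it assumes `data` has already been loaded as in step 2.
+
+```python
+import pandas as pd
+from sklearn.preprocessing import MinMaxScaler
+
+# MinMaxScaler applies the same (x - min) / (max - min) formula column by column.
+scaler = MinMaxScaler()
+scaled_array = scaler.fit_transform(data)
+
+# Wrap the result back into a DataFrame so the column names are preserved.
+normalized_data_alt = pd.DataFrame(scaled_array, columns=data.columns)
+print(normalized_data_alt.head())
+```
+
+One advantage of the transformer is that the fitted `scaler` can later apply the identical scaling to new data via `scaler.transform`.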
+
+
+
+
+### 5. Run KMeans
+**Description:** This step creates a KMeans object with the configuration described below.
+
+**Why this is important:**
+The KMeans algorithm is a popular clustering method that partitions the data into distinct groups (clusters) based on feature similarity. By configuring the parameters, we can control the behavior of the algorithm and ensure consistent results.
+
+ * `n_clusters = 2` tells KMeans to find two clusters in your data.
+ * `max_iter = 500` sets a maximum of 500 iterations for the algorithm to converge.
+ * `n_init = 40` means the algorithm will be run 40 times with different random initializations, and the best result will be chosen.
+ * `random_state = 2` ensures reproducibility; you'll get the same clustering results each time you run the code.
+
+
+```python
+# Create KMeans object
+kmeans = KMeans(n_clusters=2, max_iter=500, n_init=40, random_state=2)
+print("KMeans object created with the following parameters:")
+print(f"Number of clusters: {kmeans.n_clusters}")
+print(f"Maximum iterations: {kmeans.max_iter}")
+print(f"Number of initializations: {kmeans.n_init}")
+print(f"Random state: {kmeans.random_state}")
+
+```
+@Pyodide.eval
+
+**Output:**
+The `print` statements simply echo the parameters the KMeans object was configured with. No clustering has happened yet; the model is fit to the data in the next step.
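+
+We set `n_clusters=2` here because the dataset has two known diagnostic groups. When the right number of clusters is not known in advance, a common heuristic is the "elbow method": fit K-Means for a range of K values and look for the point where the inertia (the within-cluster sum of squared distances) stops dropping sharply. A minimal sketch, assuming `normalized_data` from step 4:
+
+```python
+# Fit K-Means for K = 1..9 and record the inertia of each fit.
+inertias = []
+k_values = range(1, 10)
+for k in k_values:
+    model = KMeans(n_clusters=k, n_init=10, random_state=2)
+    model.fit(normalized_data.values)
+    inertias.append(model.inertia_)
+
+# Plot inertia against K; the bend ("elbow") suggests a reasonable K.
+plt.plot(k_values, inertias, marker='o')
+plt.xlabel("Number of clusters (K)")
+plt.ylabel("Inertia")
+plt.title("Elbow Method")
+plt.show()
+```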
+
+
+
+
+### 6. Predict Clusters
+**Description:**
+
+* `kmeans.fit_predict()` does two things:
+
+  1. It fits the KMeans model to your normalized data, meaning it finds the cluster centers.
+  2. It predicts which cluster each data point belongs to, returning an array `identified_clusters` where each element corresponds to the cluster assignment of a data point.
+
+* We create a copy `results` of the `normalized_data` and add a new column `cluster` to it, storing the identified cluster labels.
+
+**Why this is important:**
+Fitting the KMeans model to the data and predicting clusters are crucial steps in the clustering process. By assigning each data point to a cluster, we can analyze patterns and group similar data points together. This can reveal underlying structures in the data and help in further analysis or decision-making processes.
+
+
+```python
+# Fit the KMeans model to the normalized data and predict the clusters
+identified_clusters = kmeans.fit_predict(normalized_data.values)
+
+# Create a copy of the normalized data to store the results
+results = normalized_data.copy()
+
+# Add the identified cluster labels as a new column 'cluster' in the results DataFrame
+results['cluster'] = identified_clusters
+
+# Print the results to observe the DataFrame with the cluster assignments
+print(results.head())
+```
+@Pyodide.eval
+
+**Output:**
+The output will be a preview of the first few rows of the results DataFrame, which now includes the original normalized data along with the new cluster column. In this output:
+
+* Each row corresponds to a data point (e.g., a patient's data in a medical dataset).
+* The columns represent the normalized features (e.g., age, sex, cp, etc.).
+* The cluster column indicates the cluster assignment for each data point, with values such as 0 or 1 representing different clusters.
+* This shows how the data points have been grouped into clusters by the KMeans algorithm.
+
+
+
+
+### 7. Compute Distance from Cluster Centroid
+**Description:** This step calculates the Euclidean distance between each data point and its assigned cluster centroid. The distances are stored in the list `distance_from_centroid` and added as a new column `dist` in the results DataFrame.
+
+**Why this is important:**
+Computing the distance from each data point to its cluster centroid provides insight into how well the data points are clustered around their centroids. It helps assess the compactness of clusters and can be useful for evaluating the quality of the clustering.
+
+```python
+# Calculate the Euclidean distance between each data point and its assigned cluster centroid
+distance_from_centroid = [distance.euclidean(val[:-1], kmeans.cluster_centers_[int(val[-1])]) for val in results.values]
+
+# Add the computed distances as a new column 'dist' in the results DataFrame
+results['dist'] = distance_from_centroid
+
+# Print the results to observe the DataFrame with the distance values
+print(results.head())
+```
+@Pyodide.eval
+
+**Output:**
+The output will display the first few rows of the results DataFrame with the newly added dist column, representing the distances of each data point from its assigned cluster centroid. This output allows learners to understand how the distances are calculated and see the impact of the clustering on the data.
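+
+The list comprehension works because `cluster` is the last column of `results`, so `val[:-1]` recovers a row's feature vector and `val[-1]` its cluster label. An equivalent, vectorized way to get the same distances is scikit-learn's `KMeans.transform`, which returns each point's distance to every centroid. A sketch, assuming `kmeans`, `normalized_data`, and `identified_clusters` from the previous steps:
+
+```python
+import numpy as np
+
+# transform() returns an (n_samples, n_clusters) array holding the distance
+# from every point to every cluster center.
+all_distances = kmeans.transform(normalized_data.values)
+
+# For each row, keep only the distance to that row's assigned centroid.
+row_indices = np.arange(len(identified_clusters))
+dist_vectorized = all_distances[row_indices, identified_clusters]
+
+# This should match the 'dist' column computed above, up to float rounding.
+print(dist_vectorized[:5])
+```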
+
+
+### 8. Visualize the Clusters
+**Description:** Creates a scatter plot of `chol` (Cholesterol) against `trtbps` (Resting Blood Pressure), colored by the identified clusters, with marker size proportional to the distance from the cluster centroid. (The model was already trained in step 6; this step visualizes the result.)
+
+**Why this is important:**
+Visualization is crucial for understanding clustering results. By plotting the data points with identified clusters, we can visually inspect how well the clustering algorithm has grouped similar data points together. Additionally, using marker size to represent the distance from the cluster centroid provides insights into the compactness of each cluster.
+
+```python
+results.plot.scatter(x='chol', y='trtbps', c='cluster', colormap='viridis', s='dist')
+plt.xlabel("Cholesterol")
+plt.ylabel("Resting Blood Pressure")
+plt.show()
+```
+@Pyodide.eval
+
+**Output:**
+The output is a scatter plot where each data point is represented by a marker. The markers are colored based on the identified clusters, and their sizes vary depending on the distance from the cluster centroid. This visualization allows learners to visually inspect how the data points are grouped into clusters and how compact each cluster is.
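+
+Beyond eyeballing the scatter plot, you can also score the clustering quantitatively. One widely used metric is the silhouette score, which ranges from -1 to 1; higher values mean points sit comfortably inside their own cluster and far from the others. A short sketch, assuming `normalized_data` and `identified_clusters` from the steps above:
+
+```python
+from sklearn.metrics import silhouette_score
+
+# Compare each point's cohesion (distance to its own cluster) against its
+# separation (distance to the nearest other cluster), averaged over all points.
+score = silhouette_score(normalized_data.values, identified_clusters)
+print(f"Silhouette score for K=2: {score:.3f}")
+```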
+
+
+## Review your knowledge
+
+```python
+from sklearn.cluster import KMeans
+
+# Create a KMeans instance with ____ clusters
+kmeans = KMeans(____=K)
+
+# Fit the model to the data and predict a cluster label for each point
+labels = kmeans.fit_predict(____)
+
+# Get the cluster centroids
+centroids = kmeans.cluster_centers_
+```
+
+
+Fill in the blanks, in order, to implement the K-Means clustering algorithm in Python:
+
+[( )] `k`, `k`, `X`
+[( )] `n_clusters`, `K`, `data`
+[(X)] `K`, `n_clusters`, `data`
+[( )] `data`, `n_clusters`, `K`
+***
+
+This question tests your understanding of implementing the K-Means clustering algorithm using the scikit-learn library in Python. To answer correctly, you need to identify, in order: the number of clusters (`K`), the keyword argument that sets it (`n_clusters`), and the dataset passed to the model (`data`). The correct option is therefore "`K`, `n_clusters`, `data`".
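+
+Filled in, the snippet reads as follows (here `K` stands for an integer you choose and `data` for your feature matrix):
+
+```python
+from sklearn.cluster import KMeans
+
+# Create a KMeans instance with K clusters
+kmeans = KMeans(n_clusters=K)
+
+# Fit the model to the data and predict a cluster label for each point
+labels = kmeans.fit_predict(data)
+
+# Get the cluster centroids
+centroids = kmeans.cluster_centers_
+```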
+
+***
+
+
+
+
+## Conclusion
+
+Through this lesson, you've gained a solid foundation in clustering, a cornerstone of unsupervised machine learning. You've learned how the K-Means algorithm works, its strengths and limitations, and most importantly, how to harness it within Python's powerful data science ecosystem.
+
+### Key Takeaways
+Here's a summary of key takeaways to keep in mind:
+
+* **Clustering Unveils Hidden Structures:** K-Means can reveal meaningful groupings within your data that might not be immediately apparent. This is crucial for tasks like customer segmentation, anomaly detection, and even preliminary exploration before applying more complex models.
+* **Real-World Applications Abound:** Clustering isn't just theoretical. We've seen how it can be used in medical diagnostics (predicting heart disease risk based on patient attributes) and in pharmaceutical research (identifying patient subgroups responding differently to treatments). This demonstrates the algorithm's versatility across domains.
+* **Data Preprocessing is Key:** The quality of your clustering results depends heavily on how you prepare your data. Normalization and feature scaling are often essential steps to ensure that all features contribute equally to the clustering process.
+* **K-Means Isn't Perfect:** Remember that K-Means has its limitations. It assumes clusters are spherical and of equal size, which isn't always the case in real-world data. Additionally, choosing the optimal number of clusters (K) requires careful consideration and experimentation.
+
+
+### Beyond K-Means
+While K-Means is a great starting point, the world of clustering is vast. As you progress in your machine learning journey, you'll encounter more sophisticated algorithms like DBSCAN, hierarchical clustering, and Gaussian mixture models. Each has its own strengths and use cases.
+
+Consider exploring these areas to expand your clustering toolkit:
+
+* **Dimensionality Reduction:** Techniques like PCA can help visualize high-dimensional clustered data.
+* **Cluster Evaluation:** Learn metrics like silhouette score to assess the quality of your clusters objectively.
+* **Ensemble Clustering:** Combining multiple clustering algorithms can often lead to more robust and accurate results.
+
+The knowledge you've gained here equips you to tackle a wide range of data analysis challenges. By applying clustering thoughtfully and critically, you can unlock valuable insights and drive data-driven decision-making.
+
+## Additional Resources
+
+### Full Code Implementation
+
+Here you will find a "Full Code" section where all of the code from this module is consolidated into a single block. This allows for easy copying and pasting for those who want to implement the entire process quickly. While this single block of code isn't designed as a step-by-step educational tool, it serves as a convenient reference for future use and helps streamline the process for those already familiar with the concepts.
Below is the complete code implementation: + +```python +# Import Libraries +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from sklearn.model_selection import train_test_split +from sklearn.cluster import KMeans +from scipy.spatial import distance +import io +from pyodide.http import open_url + +# Load Data +url = "https://raw.githubusercontent.com/arcus/education_modules/python_clustering/python_clustering/data/heart.csv" +url_contents = open_url(url) +text = url_contents.read() +file = io.StringIO(text) +data = pd.read_csv(file) +data.info() + +# Visualize Data +data.plot.scatter(x='chol', y='trtbps', c='output', colormap='viridis') +plt.xlabel("Cholesterol") +plt.ylabel("Resting Blood Pressure") +plt.title("Scatter Plot of Cholesterol vs. Blood Pressure") +plt.show() + +# Normalize DataFrame +def normalize(df, features): + result = df.copy() + for feature_name in features: + max_value = df[feature_name].max() + min_value = df[feature_name].min() + result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value) + return result + +normalized_data = normalize(data, data.columns) + +# Run KMeans +kmeans = KMeans(n_clusters = 2, max_iter = 500, n_init = 40, random_state = 2) + +# Predict Clusters +identified_clusters = kmeans.fit_predict(normalized_data.values) +results = normalized_data.copy() +results['cluster'] = identified_clusters + +# Compute Distance from Cluster Centroid +distance_from_centroid = [distance.euclidean(val[:-1], kmeans.cluster_centers_[int(val[-1])]) for val in results.values] +results['dist'] = distance_from_centroid + +# Visualize Clusters +results.plot.scatter(x='chol', y='trtbps', c='cluster', colormap='viridis', s='dist') +plt.xlabel("Cholesterol") +plt.ylabel("Resting Blood Pressure") +plt.show() +``` +@Pyodide.eval + +## Feedback + +@feedback diff --git a/python_clustering/python_clustering_exercise.ipynb b/python_clustering/python_clustering_exercise.ipynb new file mode 100644 index 000000000..8433e3130 --- /dev/null +++ b/python_clustering/python_clustering_exercise.ipynb @@ -0,0 +1,200 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Introduction" + ], + "metadata": { + "id": "JPi5A3zdglwx" + } + }, + { + "cell_type": "markdown", + "source": [ + "**Real World Code Example: Analyzing Polyp Progression in FAP Patients**\n", + "\n", + "This notebook investigates the effectiveness of sulindac treatment in individuals with familial adenomatous polyposis (FAP) using a refined dataset based on a landmark study published in the New England Journal of Medicine in 1993. 
We'll use K-Means clustering to explore potential subgroups of patients based on their polyp progression over time.\n",
+        "\n",
+        "**Key Variables:**\n",
+        "\n",
+        "* `age`: Patient's age\n",
+        "* `baseline`: Baseline polyp count\n",
+        "* `number3m`: Polyp count at 3 months post-treatment\n",
+        "* `number12m`: Polyp count at 12 months post-treatment"
+      ],
+      "metadata": {
+        "id": "sZSi_qPNgomr"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Install and Import"
+      ],
+      "metadata": {
+        "id": "kemfT40hg05U"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import pandas as pd\n",
+        "import io\n",
+        "from pyodide.http import open_url\n",
+        "from sklearn.cluster import KMeans\n",
+        "from sklearn.preprocessing import StandardScaler\n",
+        "import matplotlib.pyplot as plt"
+      ],
+      "metadata": {
+        "id": "jb2mmZaPhANS"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Load and Prepare Data"
+      ],
+      "metadata": {
+        "id": "Qls_4siSg2uu"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Load data from GitHub\n",
+        "url = \"https://raw.githubusercontent.com/arcus/education_modules/python_clustering/python_clustering/data/polyps.csv\"\n",
+        "url_contents = open_url(url)\n",
+        "text = url_contents.read()\n",
+        "file = io.StringIO(text)\n",
+        "df = pd.read_csv(file)\n",
+        "\n",
+        "# Print data information\n",
+        "print(df.info())\n",
+        "\n",
+        "# Select features for clustering (copy so we don't modify df)\n",
+        "features = ['age', 'baseline', 'number3m', 'number12m']\n",
+        "X = df[features].copy()\n",
+        "\n",
+        "# Fill missing values with the column means (reassigning avoids the\n",
+        "# chained-assignment warning that in-place filling on a slice can raise)\n",
+        "X = X.fillna(X.mean())\n",
+        "\n",
+        "# Standardize features\n",
+        "scaler = StandardScaler()\n",
+        "X_scaled = scaler.fit_transform(X)"
+      ],
+      "metadata": {
+        "id": "B5HVt_INhB9J"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Cluster Data"
+      ],
+      "metadata": {
+        "id": "z4yBbkVVg4cW"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Define number of clusters\n",
+        "num_clusters = 3\n",
+        "\n",
+        "# Apply K-Means clustering\n",
+        "kmeans = KMeans(n_clusters=num_clusters, random_state=42)\n",
+        "kmeans.fit(X_scaled)\n",
+        "\n",
+        "# Assign cluster labels\n",
+        "df['cluster'] = kmeans.labels_"
+      ],
+      "metadata": {
+        "id": "7-VYpcyXhDaq"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Visualize Clusters"
+      ],
+      "metadata": {
+        "id": "fEdnjjTlg6Pe"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Visualize clusters\n",
+        "plt.figure(figsize=(10, 8))\n",
+        "colors = ['red', 'blue', 'green']\n",
+        "\n",
+        "for i in range(num_clusters):\n",
+        "    cluster_data = df[df['cluster'] == i]\n",
+        "    plt.scatter(cluster_data['number3m'], cluster_data['number12m'],\n",
+        "                color=colors[i], label=f'Cluster {i}')\n",
+        "\n",
+        "plt.xlabel('Number of Polyps at 3 Months')\n",
+        "plt.ylabel('Number of Polyps at 12 Months')\n",
+        "plt.title('K-Means Clustering: Polyp Progression (3 vs. 12 Months)')\n",
+        "plt.legend()\n",
+        "plt.show()"
+      ],
+      "metadata": {
+        "id": "HUAP2uG0hE7j"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
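+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Optional: Clusters vs. Treatment\n",
+        "\n",
+        "A minimal, optional check suggested by the interpretation below: cross-tabulating the assigned `cluster` labels against the `treatment` column shows how sulindac and placebo patients are distributed over the clusters."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Count how many sulindac vs. placebo patients fall in each cluster.\n",
+        "print(pd.crosstab(df['cluster'], df['treatment']))"
+      ],
+      "metadata": {},
+      "execution_count": null,
+      "outputs": []
+    },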
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Interpretation and Further Analysis"
+      ],
+      "metadata": {
+        "id": "TesjSE05g8Ea"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "The K-Means clustering results suggest potential subgroups of patients based on polyp progression patterns. Matching the labels in the plot legend (K-Means numbers its clusters 0 through 2; which label lands on which pattern depends on the fitted centroids, so check the plot):\n",
+        "\n",
+        "* **Cluster 0 (Low Progression):** Potentially stable or slow polyp growth.\n",
+        "* **Cluster 1 (Moderate Progression):** Some increase in polyps over time.\n",
+        "* **Cluster 2 (High Progression):** Substantial increase in polyps over time.\n",
+        "\n",
+        "**Further Analysis:**\n",
+        "* Investigate differences in treatment (sulindac vs. placebo) between clusters, as in the optional cross-tab above.\n",
+        "* Explore other patient characteristics (e.g., age, sex) within each cluster.\n",
+        "* Consider alternative clustering methods or a different number of clusters.\n",
+        "\n",
+        "\n",
+        "Remember, clustering is exploratory. Additional analysis is needed to confirm these patterns and understand the underlying factors influencing polyp progression."
+      ],
+      "metadata": {
+        "id": "XuwxBwNshH5r"
+      }
+    }
+  ]
+}
\ No newline at end of file