spark手把手:[e2-spk-s02]

157
1

Upload: erhwen-kuo

Posted on 14-Apr-2017

212 views

Category:

Engineering


2 downloads

TRANSCRIPT

Page 1: Spark手把手:[e2-spk-s02]

1

Page 2: Spark手把手:[e2-spk-s02]

2 . 1

Page 7: Spark手把手:[e2-spk-s02]

# Start the sequenceiq/spark 1.6.0 container with an interactive bash shell.
#   -v  mounts the host lab directory at /home inside the container (read-write)
#   -p  publishes container ports 8088 and 8042 on the host
#   -h  sets the container hostname to "sandbox"
docker run -v /home/etadm/docker/spark/e2spkv01:/home:rw \
  -p 8088:8088 -p 8042:8042 \
  -h sandbox \
  -it sequenceiq/spark:1.6.0 bash

2 . 5

Page 8: Spark手把手:[e2-spk-s02]

# Launch spark-shell in local mode with the spark-csv package and its
# commons-csv dependency on the classpath. --jars takes a single
# comma-separated list, so there must be no whitespace around the comma.
spark-shell \
  --master local \
  --jars /home/e2-spk-s02/jars/spark-csv_2.10-1.4.0.jar,/home/e2-spk-s02/jars/commons-csv-1.1.jar

2 . 6

Page 9: Spark手把手:[e2-spk-s02]

3

Page 10: Spark手把手:[e2-spk-s02]

3

Page 11: Spark手把手:[e2-spk-s02]

3

Page 12: Spark手把手:[e2-spk-s02]

3

Page 13: Spark手把手:[e2-spk-s02]

3

Page 14: Spark手把手:[e2-spk-s02]

3

Page 15: Spark手把手:[e2-spk-s02]

4 . 1

Page 16: Spark手把手:[e2-spk-s02]

4 . 2

Page 17: Spark手把手:[e2-spk-s02]

4 . 2

Page 18: Spark手把手:[e2-spk-s02]

4 . 2

Page 19: Spark手把手:[e2-spk-s02]

4 . 2

Page 20: Spark手把手:[e2-spk-s02]

4 . 2

Page 21: Spark手把手:[e2-spk-s02]

4 . 2

Page 22: Spark手把手:[e2-spk-s02]

4 . 2

Page 23: Spark手把手:[e2-spk-s02]

4 . 2

Page 24: Spark手把手:[e2-spk-s02]

4 . 3

Page 25: Spark手把手:[e2-spk-s02]

4 . 3

Page 26: Spark手把手:[e2-spk-s02]

4 . 3

Page 27: Spark手把手:[e2-spk-s02]

4 . 3

Page 28: Spark手把手:[e2-spk-s02]

4 . 4

Page 29: Spark手把手:[e2-spk-s02]

4 . 4

Page 30: Spark手把手:[e2-spk-s02]

4 . 4

Page 31: Spark手把手:[e2-spk-s02]

4 . 5

Page 32: Spark手把手:[e2-spk-s02]

4 . 6

Page 33: Spark手把手:[e2-spk-s02]

4 . 7

Page 34: Spark手把手:[e2-spk-s02]

4 . 8

Page 35: Spark手把手:[e2-spk-s02]

4 . 9

Page 36: Spark手把手:[e2-spk-s02]

4 . 10

Page 37: Spark手把手:[e2-spk-s02]

4 . 11

Page 38: Spark手把手:[e2-spk-s02]

4 . 12

Page 39: Spark手把手:[e2-spk-s02]

4 . 13

Page 40: Spark手把手:[e2-spk-s02]

4 . 14

Page 41: Spark手把手:[e2-spk-s02]

4 . 15

Page 42: Spark手把手:[e2-spk-s02]

4 . 16

Page 43: Spark手把手:[e2-spk-s02]

4 . 17

Page 44: Spark手把手:[e2-spk-s02]

4 . 18

Page 45: Spark手把手:[e2-spk-s02]

4 . 19

Page 46: Spark手把手:[e2-spk-s02]

4 . 19

Page 47: Spark手把手:[e2-spk-s02]

5 . 1

Page 48: Spark手把手:[e2-spk-s02]

5 . 2

Page 49: Spark手把手:[e2-spk-s02]

// Create an object container: one Word per token, so the RDD can become a DataFrame
case class Word(text: String)

val fileName = "README.md"
val docs = sc.textFile(fileName)
val lower = docs.map(line => line.toLowerCase())
// Split every line on runs of whitespace
val words = lower.flatMap(line => line.split("\\s+"))

// Convert RDD to DataFrame using the case class
val words_df = words.map(Word(_)).toDF()
words_df.registerTempTable("words") // Register as a [TABLE]

// Ten most frequent words, most frequent first
val topWords = sqlContext.sql("SELECT text, count(text) AS n FROM words GROUP BY text ORDER BY n DESC LIMIT 10")
topWords.foreach(println)

5 . 3

Page 50: Spark手把手:[e2-spk-s02]

5 . 4

Page 51: Spark手把手:[e2-spk-s02]

5 . 5

Page 52: Spark手把手:[e2-spk-s02]

5 . 6

Page 53: Spark手把手:[e2-spk-s02]

5 . 7

Page 54: Spark手把手:[e2-spk-s02]

5 . 8

Page 55: Spark手把手:[e2-spk-s02]

5 . 9

Page 56: Spark手把手:[e2-spk-s02]

5 . 10

Page 57: Spark手把手:[e2-spk-s02]

5 . 11

Page 58: Spark手把手:[e2-spk-s02]

5 . 12

Page 59: Spark手把手:[e2-spk-s02]

5 . 13

Page 60: Spark手把手:[e2-spk-s02]

5 . 14

Page 61: Spark手把手:[e2-spk-s02]

5 . 15

Page 62: Spark手把手:[e2-spk-s02]

5 . 16

Page 63: Spark手把手:[e2-spk-s02]

5 . 17

Page 64: Spark手把手:[e2-spk-s02]

5 . 18

Page 65: Spark手把手:[e2-spk-s02]

// Word-count example: tokenize README.md, expose the tokens as a temp table,
// and query the ten most frequent words with SQL.
case class Word(text: String)

val fileName = "README.md"
val docs = sc.textFile(fileName)
val lower = docs.map(line => line.toLowerCase())
val words = lower.flatMap(line => line.split("\\s+"))

val words_df = words.map(Word(_)).toDF()
words_df.registerTempTable("words")

// The closing parenthesis of sqlContext.sql(...) was missing in the transcript
val topWords = sqlContext.sql("SELECT text, count(text) AS n FROM words GROUP BY text ORDER BY n DESC LIMIT 10")
topWords.foreach(println)

5 . 19

Page 66: Spark手把手:[e2-spk-s02]

5 . 20

Page 67: Spark手把手:[e2-spk-s02]

6 . 1

Page 68: Spark手把手:[e2-spk-s02]

6 . 2

Page 69: Spark手把手:[e2-spk-s02]

6 . 2

Page 70: Spark手把手:[e2-spk-s02]

6 . 2

Page 71: Spark手把手:[e2-spk-s02]

6 . 3

Page 72: Spark手把手:[e2-spk-s02]

6 . 4

Page 73: Spark手把手:[e2-spk-s02]

6 . 5

Page 74: Spark手把手:[e2-spk-s02]

6 . 6

Page 75: Spark手把手:[e2-spk-s02]

6 . 6

Page 76: Spark手把手:[e2-spk-s02]

6 . 6

Page 77: Spark手把手:[e2-spk-s02]

6 . 6

Page 78: Spark手把手:[e2-spk-s02]

6 . 6

Page 79: Spark手把手:[e2-spk-s02]

6 . 6

Page 80: Spark手把手:[e2-spk-s02]

6 . 7

Page 81: Spark手把手:[e2-spk-s02]

6 . 7

Page 82: Spark手把手:[e2-spk-s02]

6 . 7

Page 83: Spark手把手:[e2-spk-s02]

6 . 7

Page 84: Spark手把手:[e2-spk-s02]

6 . 7

Page 85: Spark手把手:[e2-spk-s02]

6 . 7

Page 86: Spark手把手:[e2-spk-s02]

6 . 8

Page 87: Spark手把手:[e2-spk-s02]

6 . 8

Page 88: Spark手把手:[e2-spk-s02]

6 . 8

Page 89: Spark手把手:[e2-spk-s02]

6 . 9

Page 90: Spark手把手:[e2-spk-s02]

6 . 9

Page 91: Spark手把手:[e2-spk-s02]

6 . 9

Page 92: Spark手把手:[e2-spk-s02]

6 . 9

Page 93: Spark手把手:[e2-spk-s02]

6 . 9

Page 94: Spark手把手:[e2-spk-s02]

6 . 9

Page 95: Spark手把手:[e2-spk-s02]

7 . 1

Page 102: Spark手把手:[e2-spk-s02]

7 . 3

Page 103: Spark手把手:[e2-spk-s02]

7 . 4

Page 104: Spark手把手:[e2-spk-s02]

7 . 4

Page 106: Spark手把手:[e2-spk-s02]

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._

// NOTE(review): `conf` is not defined in this snippet; inside spark-shell a
// SparkContext named `sc` is already provided and this line is unnecessary.
val sc = new SparkContext(conf)

// Spark SQL DataFrames are created through a SQLContext
val sqlContext = new SQLContext(sc)

// Implicit conversions used to turn an RDD into a DataFrame (e.g. rdd.toDF())
import sqlContext.implicits._

7 . 4

Page 107: Spark手把手:[e2-spk-s02]

7 . 5

Page 108: Spark手把手:[e2-spk-s02]

7 . 5

Page 109: Spark手把手:[e2-spk-s02]

7 . 5

Page 111: Spark手把手:[e2-spk-s02]

// Read a Parquet file into a DataFrame
val df = sqlContext.read.parquet("people.parquet")

// Displays the content of the DataFrame to stdout
df.show()

7 . 5

Page 112: Spark手把手:[e2-spk-s02]

7 . 6

Page 114: Spark手把手:[e2-spk-s02]

// Read a Parquet file into a DataFrame
val df = sqlContext.read.parquet("people.parquet")

// Display the content of the DataFrame on stdout
df.show()

// Print the schema
df.printSchema()

// Select only the "name" column
df.select("name").show()

// Select every row, with "age" incremented by 1
df.select(df("name"), df("age") + 1).show()

// Keep only people older than 21
df.filter(df("age") > 21).show()

// Count people per age
df.groupBy("age").count().show()

7 . 6

Page 115: Spark手把手:[e2-spk-s02]

7 . 7

Page 117: Spark手把手:[e2-spk-s02]

// Read a Parquet file into a DataFrame
val df = sqlContext.read.parquet("people.parquet")

// Register the DataFrame as a temp table so the SQL statements below can refer to it
df.registerTempTable("people")

// Display the content of the table on stdout
sqlContext.sql("SELECT * FROM people").show()

// Print the schema
sqlContext.sql("SELECT * FROM people").printSchema()

// Select only the "name" column
sqlContext.sql("SELECT name FROM people").show()

// Select every row, with "age" incremented by 1
sqlContext.sql("SELECT name, (age + 1) as age FROM people").show()

// Keep only people older than 21
sqlContext.sql("SELECT * FROM people WHERE age > 21").show()

// Count people per age
sqlContext.sql("SELECT age, count(age) as count FROM people Group By age").show()

7 . 7

Page 118: Spark手把手:[e2-spk-s02]

7 . 8

Page 119: Spark手把手:[e2-spk-s02]

7 . 8

Page 120: Spark手把手:[e2-spk-s02]

7 . 8

Page 122: Spark手把手:[e2-spk-s02]

// Define the schema with a case class
case class Person(name: String, age: Int)

// Build a DataFrame from a text file of "name,age" lines and register it as a table
val df = sc.textFile("people.txt")
  .map(_.split(","))
  .map(p => Person(p(0), p(1).trim.toInt))
  .toDF()
df.registerTempTable("people")

val teenagers = sqlContext.sql("SELECT name, age FROM people WHERE age >= 13 AND age <= 19")

// The result of a SQL query is a DataFrame and supports normal RDD operations.
// Columns of a row can be accessed by field index:
teenagers.map(t => "Name: " + t(0)).collect().foreach(println)

// ...or by field name:
teenagers.map(t => "Name: " + t.getAs[String]("name")).collect().foreach(println)

// row.getValuesMap[T] retrieves several columns at once into a Map[String, T]
teenagers.map(_.getValuesMap[Any](List("name", "age"))).collect().foreach(println)

7 . 8

Page 123: Spark手把手:[e2-spk-s02]

7 . 9

Page 124: Spark手把手:[e2-spk-s02]

7 . 9

Page 125: Spark手把手:[e2-spk-s02]

7 . 9

Page 127: Spark手把手:[e2-spk-s02]

// No format is specified, so Spark uses its default data source ("parquet")
val df = sqlContext.read.load("users.parquet")

// Save the selected columns; the output is also written in the default "parquet" format
df.select("name", "favorite_color").write.save("namesAndFavColors.parquet")

7 . 9

Page 128: Spark手把手:[e2-spk-s02]

// No format is specified, so Spark uses its default data source ("parquet")
val df = sqlContext.read.load("users.parquet")

// Save the selected columns; the output is also written in the default "parquet" format
df.select("name", "favorite_color").write.save("namesAndFavColors.parquet")

7 . 9

Page 129: Spark手把手:[e2-spk-s02]

7 . 10

Page 130: Spark手把手:[e2-spk-s02]

7 . 10

Page 131: Spark手把手:[e2-spk-s02]

7 . 10

Page 132: Spark手把手:[e2-spk-s02]

7 . 10

Page 133: Spark手把手:[e2-spk-s02]

7 . 10

Page 134: Spark手把手:[e2-spk-s02]

7 . 10

Page 135: Spark手把手:[e2-spk-s02]

7 . 10

Page 136: Spark手把手:[e2-spk-s02]

7 . 10

Page 138: Spark手把手:[e2-spk-s02]

// Name the data source format explicitly ("json") instead of relying on the default
val df = sqlContext.read.format("json").load("people.json")

// No format is given on write, so the output uses the default "parquet" format
df.select("name", "age").write.save("namesAndAges.parquet")

7 . 10

Page 139: Spark手把手:[e2-spk-s02]

7 . 11

Page 140: Spark手把手:[e2-spk-s02]

7 . 11

Page 141: Spark手把手:[e2-spk-s02]

7 . 11

Page 142: Spark手把手:[e2-spk-s02]

7 . 11

Page 143: Spark手把手:[e2-spk-s02]

7 . 11

Page 144: Spark手把手:[e2-spk-s02]

7 . 11

Page 145: Spark手把手:[e2-spk-s02]

7 . 11

Page 147: Spark手把手:[e2-spk-s02]

import org.apache.spark.sql.SaveMode

// No format is specified, so Spark uses its default data source ("parquet")
val df = sqlContext.read.load("users.parquet")

// Save in the default "parquet" format; SaveMode.Overwrite replaces any existing output
df.select("name", "favorite_color")
  .write
  .mode(SaveMode.Overwrite)
  .save("namesAndFavColors.parquet")

7 . 11

Page 148: Spark手把手:[e2-spk-s02]

8 . 1

Page 153: Spark手把手:[e2-spk-s02]

// Define the schema using a case class.
// NOTE(review): the transcript is cut off after "bidderrate:". The remaining
// fields (openbid, price, item, daystolive) and their types are reconstructed
// from the columns used by the queries below and the usual layout of this
// eBay dataset — confirm against ebay.csv.
case class Auction(
  auctionid: String,
  bid: Float,
  bidtime: Float,
  bidder: String,
  bidderrate: Integer,
  openbid: Float,
  price: Float,
  item: String,
  daystolive: Integer
)

// Load the eBay auction data
val ebayText = sc.textFile("ebay.csv")

// Parse each CSV line into an Auction.
// NOTE(review): arguments after p(3) were truncated in the transcript and are
// reconstructed to match the case class above.
val ebay = ebayText
  .map(_.split(","))
  .map(p => Auction(p(0), p(1).toFloat, p(2).toFloat, p(3), p(4).toInt,
                    p(5).toFloat, p(6).toFloat, p(7), p(8).toInt))

// Convert to a DataFrame and register it as a table
val auction = ebay.toDF()
auction.registerTempTable("auction")

// How many distinct auctions are there?
val count = auction.select("auctionid").distinct.count
System.out.println(count)

// How many bids per auction (item)?
val results = sqlContext.sql("SELECT auctionid, item, count(bid) as bid_count FROM auction GROUP BY auctionid, item")
results.show()

// Max / min / average price per auction.
// The closing parenthesis (and anything after it) was truncated in the transcript.
val results2 = sqlContext.sql("SELECT auctionid, MAX(price) as price_max, MIN(price) as price_min, AVG(price) as price_avg FROM auction GROUP BY item, auctionid")
results2.show()

8 . 4

Page 156: Spark手把手:[e2-spk-s02]

import com.databricks.spark.csv

// Use the 3rd-party spark-csv library to load a CSV file as a DataFrame.
// The chain is wrapped in parentheses so it also works when typed line by
// line into the REPL.
val df = (sqlContext.read.format("com.databricks.spark.csv")
  .option("header", "true")      // Use first line of all files as header
  .option("inferSchema", "true") // Automatically infer data types
  .load("sfpd.csv"))

// Print the schema
df.printSchema

// List the distinct Category values
df.select("Category").distinct().collect().foreach(println)

// Register as a temp table
df.registerTempTable("sfpd")

// Same distinct-category listing, expressed in SQL
sqlContext.sql("SELECT distinct Category FROM sfpd").collect().foreach(println)

// Top 10 resolutions.
// NOTE(review): everything after the SQL string was truncated in the transcript;
// the call is closed and printed the same way as the query above.
sqlContext.sql("SELECT Resolution, count(Resolution) as rescount FROM sfpd group by Resolution order by rescount desc limit 10").collect().foreach(println)

// Top 10 categories (same truncation as above)
sqlContext.sql("SELECT Category, count(Category) as catcount FROM sfpd group by Category order by catcount desc limit 10").collect().foreach(println)

8 . 6

Page 157: Spark手把手:[e2-spk-s02]

9