{"type":"doc","content":[{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"A brief introduction to HBase","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HBase is a distributed, column-oriented open source database built on top of HDFS. The name HBase comes from Hadoop database, i.e. the database of Hadoop. HBase's computing and storage capacity depends on the Hadoop cluster.","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"It sits between NoSQL and RDBMS: data can be retrieved only by primary key (row key) or by a range of primary keys, and only single-row transactions are supported (complex operations such as multi-table joins can be implemented with Hive support).","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Characteristics of tables in HBase:","attrs":{}}]},{"type":"numberedlist","attrs":{"start":null,"normalizeStart":1},"content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":1,"align":null,"origin":null},"content":[{"type":"text","text":"Large: a table can have billions of rows and millions of columns.","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":2,"align":null,"origin":null},"content":[{"type":"text","text":"Column-oriented: storage and access control are organized by column (family), and each column (family) is retrieved independently.","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":3,"align":null,"origin":null},"content":[{"type":"text","text":"Sparse: ","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Empty (null) columns take up no storage space, so tables can be designed 
very sparsely ","attrs":{}},{"type":"text","text":".","attrs":{}}]}],"attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"HBase underlying principles","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"System architecture","attrs":{}}]},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/64/64b66b06b6790d89a8e0eb9a8e933f68.png","alt":"HBase system architecture","title":"HBase system architecture","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HBase system architecture","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"The components of HBase are explained below, based on this diagram.","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"Client","attrs":{}}]},{"type":"numberedlist","attrs":{"start":null,"normalizeStart":1},"content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":1,"align":null,"origin":null},"content":[{"type":"text","text":"Contains the interfaces for accessing HBase; ","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"the client also maintains some caches to speed up access to HBase","attrs":{}},{"type":"text","text":", such as region location information.","attrs":{}}]}],"attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"Zookeeper","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HBase can use its built-in Zookeeper or an external one. In a real production environment, for the sake of consistency, the usual choice is an external 
Zookeeper.","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"The role of Zookeeper in HBase:","attrs":{}}]},{"type":"numberedlist","attrs":{"start":null,"normalizeStart":1},"content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":1,"align":null,"origin":null},"content":[{"type":"text","text":"Guarantees that at any time there is only one master in the cluster.","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":2,"align":null,"origin":null},"content":[{"type":"text","text":"Stores the addressing entry point for all regions.","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":3,"align":null,"origin":null},"content":[{"type":"text","text":"Monitors the state of the region servers in real time and notifies the master whenever a region server comes online or goes offline.","attrs":{}}]}],"attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"HMaster","attrs":{}}]},{"type":"numberedlist","attrs":{"start":null,"normalizeStart":1},"content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":1,"align":null,"origin":null},"content":[{"type":"text","text":"Assigns regions to region servers.","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":2,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Responsible for load balancing across region servers.","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":3,"align":null,"origin":null},"content":[{"type":"text","text":"Detects region servers that have failed and reassigns every affected 
region","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":4,"align":null,"origin":null},"content":[{"type":"text","text":"Garbage collection of files on HDFS.","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":5,"align":null,"origin":null},"content":[{"type":"text","text":"Handles schema update requests.","attrs":{}}]}],"attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"HRegion Server","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HRegion server ","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"maintains the regions assigned to it by HMaster","attrs":{}},{"type":"text","text":", and handles the IO requests for these regions.","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HRegion server is responsible for splitting regions that become too large during operation.","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"As you can see from the diagram, ","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"a client accessing HBase does not need HMaster to participate","attrs":{}},{"type":"text","text":" (addressing goes through Zookeeper and HRegion server; data reads and writes go through HRegion server).","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"HMaster only maintains the metadata of tables and HRegions, so its load is very low.","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"HBase table data model
","attrs":{}}]},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/a5/a5ea27dd8097742bcdcff5f8bf0cc860.png","alt":"HBase table structure","title":"HBase table structure","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HBase table structure","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"Row Key","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"As in other NoSQL databases, the row key is the primary key used to retrieve records. There are only three ways to access rows in an HBase table:","attrs":{}}]},{"type":"numberedlist","attrs":{"start":null,"normalizeStart":1},"content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":1,"align":null,"origin":null},"content":[{"type":"text","text":"Access by a single row key","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":2,"align":null,"origin":null},"content":[{"type":"text","text":"Access by a range of row keys","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":3,"align":null,"origin":null},"content":[{"type":"text","text":"Full table scan","attrs":{}}]}],"attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"A row key can be any string (","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"the maximum length is 64KB","attrs":{}},{"type":"text","text":", in practice the length is generally 10-100 bytes); inside HBase, the row key is stored as a byte array
.","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Data in an HBase table is sorted by row key (lexicographic order).","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"When stored, data is sorted by the lexicographic order (byte order) of the row key. When designing keys, take full advantage of this sorted storage and place rows that are often read together next to each other (locality).","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Note:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Sorting ints in lexicographic order gives","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"1,10,100,11,12,13,14,15,16,17,18,19,2,20,21, …. ","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"To preserve the natural order of integers, row keys must be left-padded with 0.","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"A single read or write of a row is an atomic operation (no matter how many columns are read or written at once)","attrs":{}},{"type":"text","text":". 
This design decision makes it easy for users to reason about program behavior when the same row is updated concurrently.","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"Column Family","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Every column in an HBase table belongs to a column family","attrs":{}},{"type":"text","text":". Column families are part of the table schema (columns are not), ","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"and must be defined before the table is used","attrs":{}},{"type":"text","text":".","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Column names are prefixed with their column family. For example, courses:history and courses:math both belong to the courses column family.","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Access control and disk and memory usage statistics are all handled at the column family level. 
The more column families there are, the more IO and file lookups are involved in fetching a row of data, so do not define too many column families unless necessary.","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"Column","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"A specific column under a column family; it belongs to one ColumnFamily, similar to a specific column created in MySQL.","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"Timestamp","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"In HBase, the storage unit identified by a row and a column is called a cell. Every cell keeps multiple versions of the same data. Versions are indexed by timestamp. The timestamp type is a 64-bit integer. ","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"The timestamp can be assigned by HBase (automatically when data is written)","attrs":{}},{"type":"text","text":", in which case the timestamp is the current system time accurate to milliseconds. Timestamps can also be assigned explicitly by the client. 
If the application is to avoid data version conflicts , You must generate your own unique timestamp .","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":" Every cell in , Different versions of the data are sorted in reverse chronological order ","attrs":{}},{"type":"text","text":", That is, the latest data is at the top of the list .","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" To avoid management caused by having too many versions of the data ( Includes storage and indexes ) burden ,hbase Two methods of data version recovery are provided :","attrs":{}}]},{"type":"numberedlist","attrs":{"start":null,"normalizeStart":1},"content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":1,"align":null,"origin":null},"content":[{"type":"text","text":" Save the last of the data n A version ","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":2,"align":null,"origin":null},"content":[{"type":"text","text":" Save the latest version ( Set the life cycle of the data TTL).","attrs":{}}]}],"attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" Users can set this for each column family .","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":" unit Cell","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" from {row key, column( = +