From 9d19afc3ced643537553cfcb6708929ffaa78382 Mon Sep 17 00:00:00 2001 From: xjxia Date: Tue, 5 Nov 2024 17:53:27 +0800 Subject: [PATCH] spider support primary tdbctl's fail-over --- .../dbmodule/dbmysql/MySQLBackend_switch.go | 2 +- .../dbmodule/dbmysql/MySQL_common_switch.go | 347 ++++++++++++------ .../dbmysql/SpiderProxyLayer_switch.go | 120 +++--- .../dbmysql/SpiderStorageLayer_switch.go | 20 +- 4 files changed, 320 insertions(+), 169 deletions(-) diff --git a/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/MySQLBackend_switch.go b/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/MySQLBackend_switch.go index 2c778f6677..5428f53d37 100644 --- a/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/MySQLBackend_switch.go +++ b/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/MySQLBackend_switch.go @@ -106,7 +106,7 @@ func (ins *MySQLSwitch) DoSwitch() error { ins.ReportLogs(constvar.InfoResult, "all proxy flush backends to 1.1.1.1 success") ins.ReportLogs(constvar.InfoResult, "try to reset slave") - binlogFile, binlogPosition, err := ins.ResetSlave() + binlogFile, binlogPosition, err := ins.ResetSlaveExtend(ins.StandBySlave.Ip, ins.StandBySlave.Port) if err != nil { ins.ReportLogs(constvar.FailResult, fmt.Sprintf("reset slave failed:%s", err.Error())) return fmt.Errorf("reset slave failed") diff --git a/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/MySQL_common_switch.go b/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/MySQL_common_switch.go index 26d2d30289..a743d15f6b 100644 --- a/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/MySQL_common_switch.go +++ b/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/MySQL_common_switch.go @@ -30,6 +30,29 @@ import ( "gorm.io/gorm" ) +//node role type in information_schema.TDBCTL_NODES +const ( + // PrimaryRole SHOW SLAVE STATUS no replication info and tc_is_primary is 1 + PrimaryRole = "primary" + // SecondaryRole SHOW SLAVE STATUS with replication info and tc_is_primary is 0 + SecondaryRole = 
"Secondary" + // StandaloneRole SHOW SLAVE STATUS without replication info and tc_is_primary is 0 + StandaloneRole = "Standalone" + // FalsePrimaryRole SHOW SLAVE STATUS with replication info and tc_is_primary is 1 + FalsePrimaryRole = "FalsePrimary" + // UnknownROle no tc_is_primary variable found + UnknownROle = "Unknown" +) + +//node status in information_schema.TDBCTL_NODES +const ( + StatusOnline = "Online" + StatusOffline = "Offline" + StatusUnreachable = "Unreachable" + StatusErr = "Error" +) + +//command on TDBCTL Node Server const ( // GetPrimarySQL sql to get primary tdbctl GetPrimarySQL = "TDBCTL GET PRIMARY" @@ -76,10 +99,16 @@ type MySQLCommonSwitchUtil interface { // TenDBCluster special specify, spider/remote usual include this type SpiderCommonSwitch struct { MySQLCommonSwitch - //primary tdbctl info + //cluster domain in dbmeta + ClusterName string + //all spider instances in dbmeta + SpiderNodes []dbutil.DBInstanceInfoDetail + //route table in tdbctl node + RouteTable []RouteInfo + //primary tdbctl info, if primary node broken-down, may unusable PrimaryTdbctl *TdbctlInfo - ClusterName string - RouteTable []RouteInfo + //only primary broken-down and elect success, this should be non-nil + NewPrimaryTdbctl *TdbctlInfo } // DelayInfo defined slave delay info @@ -110,61 +139,61 @@ type BinlogStatus struct { // SlaveStatus show slave status info struct type SlaveStatus struct { - SlaveIoState string `gorm:"column:Slave_IO_State"` - MasterHost string `gorm:"column:Master_Host"` - MasterUser string `gorm:"column:Master_User"` - MasterPort int `gorm:"column:Master_Port"` - ConnectRetry int `gorm:"column:Connect_Retry"` - MasterLogFile string `gorm:"column:Master_Log_File"` - ReadMasterLogPos uint64 `gorm:"column:Read_Master_Log_Pos"` - RelayLogFile string `gorm:"column:Relay_Log_File"` - RelayLogPos uint64 `gorm:"column:Relay_Log_Pos"` - RelayMasterLogFile string `gorm:"column:Relay_Master_Log_File"` - SlaveIoRunning string 
`gorm:"column:Slave_IO_Running"` - SlaveSqlRunning string `gorm:"column:Slave_SQL_Running"` - ReplicateDoDb string `gorm:"column:Replicate_Do_DB"` - ReplicateIgnoreDb string `gorm:"column:Replicate_Ignore_DB"` - ReplicateDoTable string `gorm:"column:Replicate_Do_Table"` - ReplicateIgnoreTable string `gorm:"column:Replicate_Ignore_Table"` - ReplicateWildDoTable string `gorm:"column:Replicate_Wild_Do_Table"` - ReplicateWildIgnoreTable string `gorm:"column:Replicate_Wild_Ignore_Table"` - LastErrno int `gorm:"column:Last_Errno"` - LastError string `gorm:"column:Last_Error"` - SkipCounter int `gorm:"column:Skip_Counter"` - ExecMasterLogPos uint64 `gorm:"column:Exec_Master_Log_Pos"` - RelayLogSpace uint64 `gorm:"column:Relay_Log_Space"` - UntilCondition string `gorm:"column:Until_Condition"` - UntilLogFile string `gorm:"column:Until_Log_File"` - UntilLogPos uint64 `gorm:"column:Until_Log_Pos"` - MasterSslAllowed string `gorm:"column:Master_SSL_Allowed"` - MasterSslCaFile string `gorm:"column:Master_SSL_CA_File"` - MasterSslCaPath string `gorm:"column:Master_SSL_CA_Path"` - MasterSslCert string `gorm:"column:Master_SSL_Cert"` - MasterSslCipher string `gorm:"column:Master_SSL_Cipher"` - MasterSslKey string `gorm:"column:Master_SSL_Key"` - SecondsBehindMaster int `gorm:"column:Seconds_Behind_Master"` - MasterSslVerifyServerCert string `gorm:"column:Master_SSL_Verify_Server_Cert"` - LastIoErrno int `gorm:"column:Last_IO_Errno"` - LastIoError string `gorm:"column:Last_IO_Error"` - LastSqlErrno int `gorm:"column:Last_SQL_Errno"` - LastSqlError string `gorm:"column:Last_SQL_Error"` - ReplicateIgnoreServerIds string `gorm:"column:Replicate_Ignore_Server_Ids"` - MasterServerId uint64 `gorm:"column:Master_Server_Id"` - MasterUuid string `gorm:"column:Master_UUID"` - MasterInfoFile string `gorm:"column:Master_Info_File"` - SqlDelay uint64 `gorm:"column:SQL_Delay"` - SqlRemainingDelay string `gorm:"column:SQL_Remaining_Delay"` - SlaveSqlRunningState string 
`gorm:"column:Slave_SQL_Running_State"` - MasterRetryCount int `gorm:"column:Master_Retry_Count"` - MasterBind string `gorm:"column:Master_Bind"` - LastIoErrorTimestamp string `gorm:"column:Last_IO_Error_Timestamp"` - LastSqlErrorTimestamp string `gorm:"column:Last_SQL_Error_Timestamp"` - MasterSslCrl string `gorm:"column:Master_SSL_Crl"` - MasterSslCrlpath string `gorm:"column:Master_SSL_Crlpath"` - RetrievedGtidSet string `gorm:"column:Retrieved_Gtid_Set"` - ExecutedGtidSet string `gorm:"column:Executed_Gtid_Set"` - AutoPosition string `gorm:"column:Auto_Position"` - ReplicateWildParallelTable string `gorm:"column:Replicate_Wild_Parallel_Table"` + SlaveIoState string `gorm:"column:Slave_IO_State" json:"Slave_IO_State"` + MasterHost string `gorm:"column:Master_Host" json:"Master_Host"` + MasterUser string `gorm:"column:Master_User" json:"Master_User"` + MasterPort int `gorm:"column:Master_Port" json:"Master_Port"` + ConnectRetry int `gorm:"column:Connect_Retry" json:"Connect_Retry"` + MasterLogFile string `gorm:"column:Master_Log_File" json:"Master_Log_File"` + ReadMasterLogPos uint64 `gorm:"column:Read_Master_Log_Pos" json:"Read_Master_Log_Pos"` + RelayLogFile string `gorm:"column:Relay_Log_File" json:"Relay_Log_File"` + RelayLogPos uint64 `gorm:"column:Relay_Log_Pos" json:"Relay_Log_Pos"` + RelayMasterLogFile string `gorm:"column:Relay_Master_Log_File" json:"Relay_Master_Log_File"` + SlaveIoRunning string `gorm:"column:Slave_IO_Running" json:"Slave_IO_Running"` + SlaveSqlRunning string `gorm:"column:Slave_SQL_Running" json:"Slave_SQL_Running"` + ReplicateDoDb string `gorm:"column:Replicate_Do_DB" json:"Replicate_Do_DB"` + ReplicateIgnoreDb string `gorm:"column:Replicate_Ignore_DB" json:"Replicate_Ignore_DB"` + ReplicateDoTable string `gorm:"column:Replicate_Do_Table" json:"Replicate_Do_Table"` + ReplicateIgnoreTable string `gorm:"column:Replicate_Ignore_Table" json:"Replicate_Ignore_Table"` + ReplicateWildDoTable string `gorm:"column:Replicate_Wild_Do_Table" 
json:"Replicate_Wild_Do_Table"` + ReplicateWildIgnoreTable string `gorm:"column:Replicate_Wild_Ignore_Table" json:"Replicate_Wild_Ignore_Table"` + LastErrno int `gorm:"column:Last_Errno" json:"Last_Errno"` + LastError string `gorm:"column:Last_Error" json:"Last_Error"` + SkipCounter int `gorm:"column:Skip_Counter" json:"Skip_Counter"` + ExecMasterLogPos uint64 `gorm:"column:Exec_Master_Log_Pos" json:"Exec_Master_Log_Pos"` + RelayLogSpace uint64 `gorm:"column:Relay_Log_Space" json:"Relay_Log_Space"` + UntilCondition string `gorm:"column:Until_Condition" json:"Until_Condition"` + UntilLogFile string `gorm:"column:Until_Log_File" json:"Until_Log_File"` + UntilLogPos uint64 `gorm:"column:Until_Log_Pos" json:"Until_Log_Pos"` + MasterSslAllowed string `gorm:"column:Master_SSL_Allowed" json:"Master_SSL_Allowed"` + MasterSslCaFile string `gorm:"column:Master_SSL_CA_File" json:"Master_SSL_CA_File"` + MasterSslCaPath string `gorm:"column:Master_SSL_CA_Path" json:"Master_SSL_CA_Path"` + MasterSslCert string `gorm:"column:Master_SSL_Cert" json:"Master_SSL_Cert"` + MasterSslCipher string `gorm:"column:Master_SSL_Cipher" json:"Master_SSL_Cipher"` + MasterSslKey string `gorm:"column:Master_SSL_Key" json:"Master_SSL_Key"` + SecondsBehindMaster int `gorm:"column:Seconds_Behind_Master" json:"Seconds_Behind_Master"` + MasterSslVerifyServerCert string `gorm:"column:Master_SSL_Verify_Server_Cert" json:"Master_SSL_Verify_Server_Cert"` + LastIoErrno int `gorm:"column:Last_IO_Errno" json:"Last_IO_Errno"` + LastIoError string `gorm:"column:Last_IO_Error" json:"Last_IO_Error"` + LastSqlErrno int `gorm:"column:Last_SQL_Errno" json:"Last_SQL_Errno"` + LastSqlError string `gorm:"column:Last_SQL_Error" json:"Last_SQL_Error"` + ReplicateIgnoreServerIds string `gorm:"column:Replicate_Ignore_Server_Ids" json:"Replicate_Ignore_Server_Ids"` + MasterServerId uint64 `gorm:"column:Master_Server_Id" json:"Master_Server_Id"` + MasterUuid string `gorm:"column:Master_UUID" json:"Master_UUID"` + 
MasterInfoFile string `gorm:"column:Master_Info_File" json:"Master_Info_File"` + SqlDelay uint64 `gorm:"column:SQL_Delay" json:"SQL_Delay"` + SqlRemainingDelay string `gorm:"column:SQL_Remaining_Delay" json:"SQL_Remaining_Delay"` + SlaveSqlRunningState string `gorm:"column:Slave_SQL_Running_State" json:"Slave_SQL_Running_State"` + MasterRetryCount int `gorm:"column:Master_Retry_Count" json:"Master_Retry_Count"` + MasterBind string `gorm:"column:Master_Bind" json:"Master_Bind"` + LastIoErrorTimestamp string `gorm:"column:Last_IO_Error_Timestamp" json:"Last_IO_Error_Timestamp"` + LastSqlErrorTimestamp string `gorm:"column:Last_SQL_Error_Timestamp" json:"Last_SQL_Error_Timestamp"` + MasterSslCrl string `gorm:"column:Master_SSL_Crl" json:"Master_SSL_Crl"` + MasterSslCrlpath string `gorm:"column:Master_SSL_Crlpath" json:"Master_SSL_Crlpath"` + RetrievedGtidSet string `gorm:"column:Retrieved_Gtid_Set" json:"Retrieved_Gtid_Set"` + ExecutedGtidSet string `gorm:"column:Executed_Gtid_Set" json:"Executed_Gtid_Set"` + AutoPosition string `gorm:"column:Auto_Position" json:"Auto_Position"` + ReplicateWildParallelTable string `gorm:"column:Replicate_Wild_Parallel_Table" json:"Replicate_Wild_Parallel_Table"` } // RouteInfo route in mysql.servers @@ -182,7 +211,9 @@ type TdbctlInfo struct { ServerName string `gorm:"column:SERVER_NAME"` Host string `gorm:"column:HOST"` Port int `gorm:"column:PORT"` - //if 1, indicate this server is primary + /* if 1, indicate this server is primary + only primary node broken-down trigger elect + */ CurrentServer int `gorm:"column:IS_THIS_SERVER"` } @@ -596,10 +627,8 @@ func (ins *MySQLCommonSwitch) ConnectInstance(host string, port int) (*sql.DB, e return dbutil.ConnMySQL(connParam) } -// ResetSlave do reset slave -func (ins *MySQLCommonSwitch) ResetSlave() (string, uint64, error) { - slaveIp := ins.StandBySlave.Ip - slavePort := ins.StandBySlave.Port +// ResetSlaveExtend do reset slave and save consistent binlog info +func (ins *MySQLCommonSwitch) 
ResetSlaveExtend(slaveIp string, slavePort int) (string, uint64, error) { user := ins.Config.DBConf.MySQL.User pass := ins.Config.DBConf.MySQL.Pass log.Logger.Infof("gonna RESET SLAVE on %s:%d", slaveIp, slavePort) @@ -612,6 +641,11 @@ func (ins *MySQLCommonSwitch) ResetSlave() (string, uint64, error) { log.Logger.Errorf("open mysql failed. ip:%s, port:%d, err:%s", slaveIp, slavePort, err.Error()) return "", 0, err } + defer func() { + if conn, err := db.DB(); err == nil { + _ = conn.Close() + } + }() stopSql := "stop slave" masterSql := "show master status" @@ -642,6 +676,57 @@ func (ins *MySQLCommonSwitch) ResetSlave() (string, uint64, error) { return masterStatus.File, masterStatus.Position, nil } +// ChangeMasterAuto stop slave on the given node, execute changeSql and then start slave again +func (ins *MySQLCommonSwitch) ChangeMasterAuto(slaveIp string, slavePort int, changeSql string) error { + user := ins.Config.DBConf.MySQL.User + pass := ins.Config.DBConf.MySQL.Pass + + ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("try to connect node:%s#%d", slaveIp, slavePort)) + connParam := fmt.Sprintf("%s:%s@(%s:%d)/%s", user, pass, slaveIp, slavePort, "infodba_schema") + db, err := gorm.Open(mysql.Open(connParam), &gorm.Config{ + Logger: log.GormLogger, + }) + if err != nil { + log.Logger.Errorf("open mysql failed. ip:%s, port:%d, err:%s", slaveIp, slavePort, err.Error()) + return err + } + defer func() { + if conn, err := db.DB(); err == nil { + _ = conn.Close() + } + }() + + ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("node:%s#%d do stop slave", slaveIp, slavePort)) + err = db.Exec("stop slave").Error + if err != nil { + return fmt.Errorf("stop slave failed.
err:%s", err.Error()) + } + ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("node:%s#%d do stop slave success", slaveIp, slavePort)) + + var slaveStatus SlaveStatus + err = db.Raw("show slave status").Scan(&slaveStatus).Error + if err != nil { + return fmt.Errorf("show slave status failed, err:%s", err.Error()) + } + ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("before change to new master, binlog_file:%s, "+ + "binlog_pos:%d", slaveStatus.RelayMasterLogFile, slaveStatus.ExecMasterLogPos)) + + err = db.Exec(changeSql).Error + if err != nil { + return fmt.Errorf("do change master sql failed:%s", err.Error()) + } + ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("node:%s#%d do CHANGE SQL success", slaveIp, slavePort)) + + err = db.Exec("start slave").Error + if err != nil { + return fmt.Errorf("do START SLAVE failed:%s", err.Error()) + } + + ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("node:%s#%d execute START SLAVE success", slaveIp, slavePort)) + + return nil +} + // UpdateMetaInfo swap master, slave 's meta info in cmdb func (ins *MySQLCommonSwitch) UpdateMetaInfo() error { return nil @@ -680,9 +765,9 @@ func (ins *SpiderCommonSwitch) QueryRouteInfo(db *sql.DB) ([]RouteInfo, error) { return routeTable, nil } -// QueryNodeInfo query nodes info from information_schema.TDBCTL_NODES -func (ins *SpiderCommonSwitch) QueryNodeInfo(db *sql.DB) ([]TdbctlNodes, error) { - nodesInfo := make([]TdbctlNodes, 0) +// QueryNodesInfo query nodes info from information_schema.TDBCTL_NODES +func (ins *SpiderCommonSwitch) QueryNodesInfo(db *sql.DB) (map[string]TdbctlNodes, error) { + nodesInfo := make(map[string]TdbctlNodes, 0) ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("try to execute sql[%s]", GetNodeSQL)) rows, err := db.Query(GetNodeSQL) if err != nil { @@ -695,7 +780,7 @@ func (ins *SpiderCommonSwitch) QueryNodeInfo(db *sql.DB) ([]TdbctlNodes, error) &node.Message, &node.ReplicationInfo); err != nil { return nil, fmt.Errorf("query tdbctl_nodes failed:%s", err.Error()) 
} - nodesInfo = append(nodesInfo, node) + nodesInfo[node.ServerName] = node } if len(nodesInfo) == 0 { return nil, fmt.Errorf("no node info found") @@ -706,17 +791,6 @@ func (ins *SpiderCommonSwitch) QueryNodeInfo(db *sql.DB) ([]TdbctlNodes, error) return nodesInfo, nil } -func (ins *SpiderCommonSwitch) ConnectPrimaryTdbctl() (*sql.DB, error) { - if ins.PrimaryTdbctl == nil { - return nil, fmt.Errorf("none primary node set") - } - primaryConn, err := ins.ConnectInstance(ins.PrimaryTdbctl.Host, ins.PrimaryTdbctl.Port) - if err != nil { - return nil, fmt.Errorf("connect primary tdbctl failed:%s", err.Error()) - } - return primaryConn, nil -} - // RemoveNodeFromRoute connect primary node and remove input node's route func (ins *SpiderCommonSwitch) RemoveNodeFromRoute(primaryConn *sql.DB, host string, port int) error { routeInfo := ins.GetNodeRoute(host, port) @@ -733,21 +807,15 @@ func (ins *SpiderCommonSwitch) RemoveNodeFromRoute(primaryConn *sql.DB, host str return nil } -func (ins *SpiderCommonSwitch) SetPrimary(node *TdbctlInfo) { - ins.PrimaryTdbctl = node -} - -// GetPrimary found primary node from all spiders -// if primary node down or no primary found, return error -// 1) ER_TCADMIN_GET_PRIMARY: primary broken-down -// 2) other error, get primary failed -func (ins *SpiderCommonSwitch) GetPrimary() (*TdbctlInfo, error) { - primaryTdbctl := &TdbctlInfo{} - allSpiders, err := ins.GetAllSpiders() - if err != nil { - return nil, err - } - for _, spider := range allSpiders { +// GetPrimary found primary node from any connected tdbctl node's route table +// If no primary found, return error. 
+// Any of the conditions below is enough to find the primary +// 1) There is exactly one node with PrimaryRole and StatusOnline +// 2) No primary role found, and all alive SecondaryRole nodes' ReplicationMaster are the same, +// then the ReplicationMaster is taken as the primary node's ServerName +func (ins *SpiderCommonSwitch) GetPrimary() error { + replicaServer := "" + for _, spider := range ins.SpiderNodes { //only spider-master had tdbctl node, and should connect use admin port if spider.Status == constvar.UNAVAILABLE || spider.SpiderRole == constvar.TenDBClusterProxySlave { @@ -767,52 +835,95 @@ func (ins *SpiderCommonSwitch) GetPrimary() (*TdbctlInfo, error) { spider.IP, spider.AdminPort)) //TODO: gorm bug? must use sql.DB instead here //get primary tdbctl from connected tdbctl - if err := currentConn.QueryRow(GetPrimarySQL).Scan(primaryTdbctl.ServerName, - primaryTdbctl.Host, primaryTdbctl.Port, primaryTdbctl.CurrentServer); err != nil { - ins.ReportLogs(constvar.WarnResult, fmt.Sprintf("execute [%s] failed:%s", - GetPrimarySQL, err.Error())) - _ = currentConn.Close() - if dbutil.GetMySQLErrorCode(err) == ER_TCADMIN_GET_PRIMARY { - ins.ReportLogs(constvar.WarnResult, fmt.Sprintf("errcode is %d, primary node broken-down", - ER_TCADMIN_GET_PRIMARY)) - return nil, nil - } else { - ins.ReportLogs(constvar.WarnResult, "get primary failed, try other nodes") - continue + + nodeMaps, err := ins.QueryNodesInfo(currentConn) + if err != nil { + ins.ReportLogs(constvar.WarnResult, "get all tdbctl node info failed, try other nodes") + } else { + for _, node := range nodeMaps { + ins.ReportLogs(constvar.InfoResult, + fmt.Sprintf("try to check node:%s", util.GraceStructString(node))) + if strings.EqualFold(node.Status, StatusOnline) && + strings.EqualFold(node.ClusterRole, PrimaryRole) { + if ins.PrimaryTdbctl != nil { + ins.ReportLogs(constvar.FailResult, fmt.Sprintf("multi primary node [%s#%d] and [%s#%d] found", + ins.PrimaryTdbctl.Host, ins.PrimaryTdbctl.Port, node.Host, node.Port)) +
return fmt.Errorf("multi primary node found") + } + ins.PrimaryTdbctl = &TdbctlInfo{ + ServerName: node.ServerName, + Host: node.Host, + Port: node.Port, + CurrentServer: 0, + } + if ins.Ip == node.Host { + return fmt.Errorf("broken-down node is primary, but its status is %s", StatusOnline) + } + } + if strings.EqualFold(node.ClusterRole, SecondaryRole) { + if replicaServer == "" { + replicaServer = node.ReplicationMaster + } + if replicaServer != "" && replicaServer != node.ReplicationMaster { + ins.ReportLogs(constvar.FailResult, fmt.Sprintf("multi ReplicationMaster found[%s,%s]", + node.ReplicationMaster, node.ServerName)) + return fmt.Errorf("multi ReplicationMaster found") + } + } + } + //after scan all nodes in route table, if no primary node found, + //maybe primary node broken-down. And the replicaServer must be the primary + //node's ServerName, choose it + if replicaServer != "" && ins.PrimaryTdbctl == nil { + ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("primary node's server name is %s", + replicaServer)) + primaryNode := nodeMaps[replicaServer] + ins.PrimaryTdbctl = &TdbctlInfo{ + ServerName: primaryNode.ServerName, + Host: primaryNode.Host, + Port: primaryNode.Port, + CurrentServer: 0, + } + if primaryNode.Host == ins.Ip { + ins.ReportLogs(constvar.InfoResult, "current broken-down node is primary") + ins.PrimaryTdbctl.CurrentServer = 1 + } } } - - ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("get primary tdbctl success, primary info:%s#%d", - primaryTdbctl.Host, primaryTdbctl.Port)) - _ = currentConn.Close() - return primaryTdbctl, nil } } - return nil, fmt.Errorf("no appropriate primary tdbctl found") + if ins.PrimaryTdbctl == nil { + return fmt.Errorf("no primary node found") + } + + ins.ReportLogs(constvar.WarnResult, fmt.Sprintf("get primary node[%s#%d] success", + ins.PrimaryTdbctl.Host, ins.PrimaryTdbctl.Port)) + + return nil } -func (ins *SpiderCommonSwitch) GetAllSpiders() ([]dbutil.DBInstanceInfoDetail, error) { - allSpiders := 
[]dbutil.DBInstanceInfoDetail{} +// SetSpiderNodes get all spider nodes from dbmeta +func (ins *SpiderCommonSwitch) SetSpiderNodes() error { cmdbClient := client.NewCmDBClient(&ins.Config.DBConf.CMDB, ins.Config.GetCloudId()) rawData, err := cmdbClient.GetDBInstanceInfoByCluster(ins.ClusterName) if err != nil { - return nil, fmt.Errorf("get all cluster instance info failed:%s", err.Error()) + return fmt.Errorf("get all cluster instance info failed:%s", err.Error()) } for _, v := range rawData { cmdbIns := dbutil.DBInstanceInfoDetail{} rawIns, jsonErr := json.Marshal(v) if jsonErr != nil { - return nil, fmt.Errorf("get cmdb instance info failed:%s", jsonErr.Error()) + return fmt.Errorf("get cmdb instance info failed:%s", jsonErr.Error()) } if jsonErr = json.Unmarshal(rawIns, &cmdbIns); jsonErr != nil { - return nil, fmt.Errorf("get cmdb instance info failed:%s", jsonErr.Error()) + return fmt.Errorf("get cmdb instance info failed:%s", jsonErr.Error()) } if cmdbIns.MachineType == constvar.TenDBClusterProxyType { - allSpiders = append(allSpiders, cmdbIns) + ins.SpiderNodes = append(ins.SpiderNodes, cmdbIns) } } - return allSpiders, nil + return nil } diff --git a/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/SpiderProxyLayer_switch.go b/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/SpiderProxyLayer_switch.go index eb4caca488..80b0f1d3f1 100644 --- a/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/SpiderProxyLayer_switch.go +++ b/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/SpiderProxyLayer_switch.go @@ -23,25 +23,6 @@ import ( "strings" ) -//cluster role type in information_schema.TDBCTL_NODES -const ( - // PrimaryRole SHOW SLAVE STATUS no replication info and tc_is_primary is 1 - PrimaryRole = "primary" - // SecondaryRole SHOW SLAVE STATUS with replication info and tc_is_primary is 0 - SecondaryRole = "Secondary" - // StandaloneRole SHOW SLAVE STATUS without replication info and tc_is_primary is 0 - StandaloneRole = "Standalone" - // 
FalsePrimaryRole SHOW SLAVE STATUS with replication info and tc_is_primary is 1 - FalsePrimaryRole = "FalsePrimary" - // UnknownROle no tc_is_primary variable found - UnknownROle = "Unknown" -) - -const ( - // ER_TCADMIN_GET_PRIMARY get primary failed - ER_TCADMIN_GET_PRIMARY = 12049 -) - // SpiderProxyLayerSwitch spider node switch type SpiderProxyLayerSwitch struct { SpiderCommonSwitch @@ -49,6 +30,8 @@ type SpiderProxyLayerSwitch struct { AdminPort int //storage layer instance used Entry dbutil.BindEntry + //temporary secondary node, after elect new primary, need to CHANGE MASTER TO + SecondaryNodes []TdbctlNodes } // EnablePrimary connect candidate node and execute TDBCTL ENABLE PRIMARY FORCE @@ -77,11 +60,6 @@ func (ins *SpiderProxyLayerSwitch) EnablePrimary(rawPrimaryNode *TdbctlInfo) err } func (ins *SpiderProxyLayerSwitch) ElectPrimaryCandidate() (*TdbctlInfo, error) { - allSpiders, err := ins.GetAllSpiders() - if err != nil { - return nil, err - } - getLogFileIndex := func(logFile string) (int, error) { parts := strings.Split(logFile, ".") if len(parts) < 2 { @@ -91,14 +69,14 @@ func (ins *SpiderProxyLayerSwitch) ElectPrimaryCandidate() (*TdbctlInfo, error) } var electNode *TdbctlInfo - var nodes []TdbctlNodes + var nodes map[string]TdbctlNodes maxRelayIndex := -1 maxExecPos := uint64(0) oldPrimaryName := "" //found any node and get nodes info from TDBCTL_NODES ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("try to connect any alived tdbctl node and get nodes info")) - for _, spider := range allSpiders { + for _, spider := range ins.SpiderNodes { //only spider-master had tdbctl node, and should connect use admin port if spider.Status == constvar.UNAVAILABLE || spider.SpiderRole == constvar.TenDBClusterProxySlave { @@ -120,7 +98,7 @@ func (ins *SpiderProxyLayerSwitch) ElectPrimaryCandidate() (*TdbctlInfo, error) //try to get nodes info var err error - nodes, err = ins.QueryNodeInfo(currentConn) + nodes, err = ins.QueryNodesInfo(currentConn) _ = 
currentConn.Close() if err != nil { ins.ReportLogs(constvar.InfoResult, fmt.Sprintf(" failed:%s, try others", err.Error())) @@ -136,9 +114,14 @@ func (ins *SpiderProxyLayerSwitch) ElectPrimaryCandidate() (*TdbctlInfo, error) return nil, fmt.Errorf("failed to retrieve any nodes information") } - ins.ReportLogs(constvar.InfoResult, "try to elect a appropriate node as primary") + ins.ReportLogs(constvar.InfoResult, "try to elect an appropriate node as primary") for _, node := range nodes { //1. clusterRole must be Secondary + //should not happen + if strings.EqualFold(node.ClusterRole, PrimaryRole) { + return nil, fmt.Errorf("[bug]node[%s#%d]'s clusterRole[%s] is primary, can not happen", + node.Host, node.Port, node.ClusterRole) + } if !strings.EqualFold(node.ClusterRole, SecondaryRole) { ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("node[%s#%d]'s clusterRole[%s] is not secondary, skip", node.Host, node.Port, node.ClusterRole)) @@ -168,7 +151,9 @@ func (ins *SpiderProxyLayerSwitch) ElectPrimaryCandidate() (*TdbctlInfo, error) "try other nodes", node.Host, node.Port, err.Error())) continue } - if !strings.EqualFold(replInfo.SlaveSqlRunning, "YES") { + log.Logger.Debugf("node[%s#%d]'s REPLICATION_INFO:%s", + node.Host, node.Port, util.GraceStructString(replInfo)) + if !strings.EqualFold(replInfo.SlaveSqlRunning, "Yes") { ins.ReportLogs(constvar.WarnResult, fmt.Sprintf("node[%s#%d]'s sql_thread not Yes, try other nodes", node.Host, node.Port)) continue @@ -182,6 +167,8 @@ func (ins *SpiderProxyLayerSwitch) ElectPrimaryCandidate() (*TdbctlInfo, error) node.Host, node.Port, err.Error())) continue } + //add node to array and repair new replication later + ins.SecondaryNodes = append(ins.SecondaryNodes, node) if relayIndex > maxRelayIndex || (relayIndex == maxRelayIndex && replInfo.ExecMasterLogPos > maxExecPos) { @@ -214,48 +201,66 @@ func (ins *SpiderProxyLayerSwitch) CheckSwitch() (bool, error) { // 3. remove broken-down node from primary-tdbctl route table // 4. 
primary-tdbctl do flush routing func (ins *SpiderProxyLayerSwitch) DoSwitch() error { - //1. update name service + var ( + primaryHost string + primaryPort int + ) + + //1. delete name service ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("try to release ip[%s#%d] from all domain entry", ins.Ip, ins.Port)) if err := ins.DeleteNameService(ins.Entry); err != nil { return err } + //2. set all spider nodes + if err := ins.SetSpiderNodes(); err != nil { + return err + } + //2. try to get primary node - primaryNode, err := ins.GetPrimary() - if err != nil { + if err := ins.GetPrimary(); err != nil { ins.ReportLogs(constvar.FailResult, "get primary node failed") return err } - if primaryNode == nil { - //2.1 no primary node found, elect one - ins.ReportLogs(constvar.InfoResult, "no appropriate primary found, try to elect one") - primaryNode, err = ins.ElectPrimaryCandidate() + //3. primary node broken-down, try to elect one + if ins.PrimaryTdbctl.CurrentServer == 1 { + ins.ReportLogs(constvar.InfoResult, "primary node broken-down, try to elect one") + + newPrimaryNode, err := ins.ElectPrimaryCandidate() if err != nil { ins.ReportLogs(constvar.FailResult, "elect primary node failed") return err } - if err = ins.EnablePrimary(primaryNode); err != nil { + if err = ins.EnablePrimary(newPrimaryNode); err != nil { ins.ReportLogs(constvar.FailResult, "enable primary node failed") return err } ins.ReportLogs(constvar.FailResult, fmt.Sprintf("enable primary node[%s#%d] success", - primaryNode.Host, primaryNode.Port)) + newPrimaryNode.Host, newPrimaryNode.Port)) + + //set new primary node + ins.NewPrimaryTdbctl = newPrimaryNode } - //3. set primary - ins.SetPrimary(primaryNode) //4. 
get all route from primary node + if ins.NewPrimaryTdbctl != nil { + primaryHost = ins.NewPrimaryTdbctl.Host + primaryPort = ins.NewPrimaryTdbctl.Port + } else { + primaryHost = ins.PrimaryTdbctl.Host + primaryPort = ins.PrimaryTdbctl.Port + } log.Logger.Debugf("try to connect to primary tdbctl") - primaryConn, err := ins.ConnectPrimaryTdbctl() + primaryConn, err := ins.ConnectInstance(primaryHost, primaryPort) if err != nil { return err } defer func() { _ = primaryConn.Close() }() - ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("get all route table before switch")) + ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("get all route table from primary before switch")) if ins.RouteTable, err = ins.QueryRouteInfo(primaryConn); err != nil { _ = primaryConn.Close() return fmt.Errorf("get all route info failed:%s", err.Error()) @@ -299,6 +304,40 @@ func (ins *SpiderProxyLayerSwitch) RollBack() error { } func (ins *SpiderProxyLayerSwitch) DoFinal() error { + if ins.PrimaryTdbctl != nil && ins.PrimaryTdbctl.CurrentServer == 1 && ins.NewPrimaryTdbctl != nil { + //whether all lived tdbctl do change master to + allNodeRepaired := true + newMaster := ins.NewPrimaryTdbctl + ins.ReportLogs(constvar.InfoResult, + "primary broke-down and elect success, try to repair new replication") + //1. reset slave on new primary + ins.ReportLogs(constvar.InfoResult, "do reset slave first") + if binlogFile, binlogPosition, err := ins.ResetSlaveExtend(newMaster.Host, newMaster.Port); err != nil { + ins.ReportLogs(constvar.FailResult, "new primary node do reset slave failed") + return err + } else { + ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("before reset slave, consistent binlog info:%s,%d", + binlogFile, binlogPosition)) + } + //2.
do change master to on all lived tdbctl nodes + changeSQL := fmt.Sprintf("change master to master_host='%s', master_port=%d, master_auto_position=1", + newMaster.Host, newMaster.Port) + ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("do [%s] on all lived tdbctl nodes", changeSQL)) + for _, node := range ins.SecondaryNodes { + if node.Host == newMaster.Host && node.Port == newMaster.Port { + continue + } + err := ins.ChangeMasterAuto(node.Host, node.Port, changeSQL) + if err != nil { + ins.ReportLogs(constvar.WarnResult, err.Error()) + allNodeRepaired = false + } + } + if !allNodeRepaired { + return fmt.Errorf("not all alive nodes change master to success") + } + } + return nil } diff --git a/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/SpiderStorageLayer_switch.go b/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/SpiderStorageLayer_switch.go index 15701b1a3b..49de3a5b62 100644 --- a/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/SpiderStorageLayer_switch.go +++ b/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/SpiderStorageLayer_switch.go @@ -83,24 +83,28 @@ func (ins *SpiderStorageSwitch) CheckSwitch() (bool, error) { func (ins *SpiderStorageSwitch) DoSwitch() error { //1. get primary tdbctl node ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("get primary tdbctl node before switch")) - primaryNode, err := ins.GetPrimary() - if err != nil || primaryNode == nil { - ins.ReportLogs(constvar.FailResult, "get primary node failed") + + //2. set all spider nodes + if err := ins.SetSpiderNodes(); err != nil { return err } - //2. set primary - ins.SetPrimary(primaryNode) //3. 
get all route from primary node + if err := ins.GetPrimary(); err != nil { + ins.ReportLogs(constvar.FailResult, "get primary node failed") + return err + } + log.Logger.Debugf("try to connect to primary tdbctl") - primaryConn, err := ins.ConnectPrimaryTdbctl() + primaryConn, err := ins.ConnectInstance(ins.PrimaryTdbctl.Host, ins.PrimaryTdbctl.Port) if err != nil { return err } defer func() { _ = primaryConn.Close() }() - ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("get all route table before switch")) + + ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("get all route table from primary node before switch")) if ins.RouteTable, err = ins.QueryRouteInfo(primaryConn); err != nil { _ = primaryConn.Close() return fmt.Errorf("get all route info failed:%s", err.Error()) @@ -139,7 +143,7 @@ func (ins *SpiderStorageSwitch) DoSwitch() error { ins.ReportLogs(constvar.InfoResult, "one-phase: flush 1.1.1.1 to all spider success") ins.ReportLogs(constvar.InfoResult, "try to reset slave") - binlogFile, binlogPosition, err := ins.ResetSlave() + binlogFile, binlogPosition, err := ins.ResetSlaveExtend(ins.StandBySlave.Ip, ins.StandBySlave.Port) if err != nil { ins.ReportLogs(constvar.FailResult, fmt.Sprintf("reset slave failed:%s", err.Error())) return fmt.Errorf("reset slave failed")