Skip to content

Commit

Permalink
[PDR-16012][feat]logkit发送接口延迟,内部queue长度指标开发
Browse files Browse the repository at this point in the history
  • Loading branch information
shangmin-001 committed Jan 14, 2022
1 parent dc122cc commit 1fb65f3
Show file tree
Hide file tree
Showing 6 changed files with 148 additions and 20 deletions.
15 changes: 10 additions & 5 deletions mgr/metric_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ func (r *MetricRunner) Run() {
dataCnt := 0
datas := make([]Data, 0)
metricTime := time.Now()
tags[metric.Timestamp] = metricTime.Format(time.RFC3339Nano)
tags[metric.Timestamp] = metricTime.UnixNano()/1e6
for _, c := range r.collectors {
metricName := c.Name()
tmpdatas, err := c.Collect()
Expand Down Expand Up @@ -610,10 +610,14 @@ func (mr *MetricRunner) StatusRestore() {
}
sStatus, ok := s.(sender.StatsSender)
if ok {
sStatus.Restore(&StatsInfo{
statsInfo:=&StatsInfo{
Success: info[0],
Errors: info[1],
})
}
if len(info)>2{
statsInfo.FtSendLag=info[2]
}
sStatus.Restore(statsInfo)
}
status, ext := mr.rs.SenderStats[name]
if !ext {
Expand All @@ -635,7 +639,7 @@ func (mr *MetricRunner) StatusBackup() {
status.ParserStats.Success,
status.ParserStats.Errors,
},
SenderCnt: map[string][2]int64{},
SenderCnt: map[string][]int64{},
}
for _, s := range mr.senders {
name := s.Name()
Expand All @@ -646,9 +650,10 @@ func (mr *MetricRunner) StatusBackup() {
status.SenderStats[name] = senderStats
}
if sta, exist := status.SenderStats[name]; exist {
bStart.SenderCnt[name] = [2]int64{
bStart.SenderCnt[name] = []int64{
sta.Success,
sta.Errors,
sta.FtSendLag,
}
}
}
Expand Down
16 changes: 11 additions & 5 deletions mgr/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -1476,10 +1476,14 @@ func (r *LogExportRunner) StatusRestore() {
}
sStatus, ok := s.(sender.StatsSender)
if ok {
sStatus.Restore(&StatsInfo{
statsInfo:=&StatsInfo{
Success: info[0],
Errors: info[1],
})
}
if len(info)>2{
statsInfo.FtSendLag=info[2]
}
sStatus.Restore(statsInfo)
}
status, ext := r.rs.SenderStats[name]
if !ext {
Expand Down Expand Up @@ -1519,7 +1523,7 @@ func (r *LogExportRunner) StatusBackup() {
status.ParserStats.Errors,
},
TransCnt: map[string][2]int64{},
SenderCnt: map[string][2]int64{},
SenderCnt: map[string][]int64{},
}
r.historyMutex.Lock()
defer r.historyMutex.Unlock()
Expand All @@ -1535,9 +1539,10 @@ func (r *LogExportRunner) StatusBackup() {
for idx, t := range r.transformers {
name := formatTransformName(t.Type(), idx)
sta := t.Stats()
bStart.SenderCnt[name] = [2]int64{
bStart.SenderCnt[name] = []int64{
sta.Success,
sta.Errors,
sta.FtSendLag,
}
}

Expand All @@ -1563,9 +1568,10 @@ func (r *LogExportRunner) StatusBackup() {
status.SenderStats[name] = senderStats
}
if sta, exist := status.SenderStats[name]; exist {
bStart.SenderCnt[name] = [2]int64{
bStart.SenderCnt[name] = []int64{
sta.Success,
sta.Errors,
sta.FtSendLag,
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion reader/meta.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ const (
type Statistic struct {
ReaderCnt int64 `json:"reader_count"` // 读取总条数
ParserCnt [2]int64 `json:"parser_connt"` // [解析成功, 解析失败]
SenderCnt map[string][2]int64 `json:"sender_count"` // [发送成功, 发送失败]
SenderCnt map[string][]int64 `json:"sender_count"` // [发送成功, 发送失败]
TransCnt map[string][2]int64 `json:"transform_count"` // [解析成功, 解析失败]
ReadErrors ErrorStatistic `json:"read_errors"`
ParseErrors ErrorStatistic `json:"parse_errors"`
Expand Down
2 changes: 1 addition & 1 deletion reader/meta_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ func TestMeta(t *testing.T) {
stat := &Statistic{
ReaderCnt: 6,
ParserCnt: [2]int64{6, 8},
SenderCnt: map[string][2]int64{
SenderCnt: map[string][]int64{
"aaa": {1, 2},
"bbb": {5, 6},
},
Expand Down
131 changes: 123 additions & 8 deletions sender/fault_tolerant.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ import (
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"math"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
Expand All @@ -32,6 +34,12 @@ const (
defaultMaxProcs = 1 // 默认没有并发
// TypeMarshalError 表示marshal出错
TypeMarshalError = reqerr.SendErrorType("Data Marshal failed")
// KeyUnMarshalError is the error-message prefix used to recognize unmarshal failures.
KeyUnMarshalError = "Data unmarshal failed"
// NumUnMarshalError is the consecutive-unmarshal-failure threshold before backing off.
NumUnMarshalError = 10
// LagFilename is the name of the file used to persist FtSendLag across restarts.
LagFilename = "meta.lag"
)

var _ SkipDeepCopySender = &FtSender{}
Expand Down Expand Up @@ -202,6 +210,9 @@ func newFtSender(innerSender Sender, runnerName string, opt *FtOption) (*FtSende
isBlock: opt.isBlock,
backoff: utils.NewBackoff(2, 1, 1*time.Second, 5*time.Minute),
}
ftSender.statsMutex.Lock()
ftSender.stats.FtSendLag = ftSender.readLag()
ftSender.statsMutex.Unlock()

if opt.innerSenderType == TypePandora {
ftSender.pandoraKeyCache = make(map[string]KeyInfo)
Expand Down Expand Up @@ -269,9 +280,17 @@ func (ft *FtSender) RawSend(datas []string) error {
} else {
// se 中的 lasterror 和 senderror 都为空,需要使用 se.FtQueueLag
se.AddSuccessNum(len(datas))
ft.statsMutex.Lock()
ft.stats.FtSendLag = ft.stats.FtSendLag + int64(len(datas))
ft.statsMutex.Unlock()
ft.backoff.Reset()
}
se.FtQueueLag = ft.BackupQueue.Depth() + ft.logQueue.Depth()
if se.FtQueueLag == 0 {
ft.statsMutex.Lock()
ft.stats.FtSendLag = 0
ft.statsMutex.Unlock()
}
}
return se
}
Expand Down Expand Up @@ -350,9 +369,17 @@ func (ft *FtSender) Send(datas []Data) error {
} else {
// se 中的 lasterror 和 senderror 都为空,需要使用 se.FtQueueLag
se.AddSuccessNum(len(datas))
ft.statsMutex.Lock()
ft.stats.FtSendLag = ft.stats.FtSendLag + int64(len(datas))
ft.statsMutex.Unlock()
ft.backoff.Reset()
}
se.FtQueueLag = ft.BackupQueue.Depth() + ft.logQueue.Depth()
if se.FtQueueLag == 0 {
ft.statsMutex.Lock()
ft.stats.FtSendLag = 0
ft.statsMutex.Unlock()
}
return se
}

Expand Down Expand Up @@ -391,6 +418,9 @@ func (ft *FtSender) Close() error {
// persist queue's meta data
ft.logQueue.Close()
ft.BackupQueue.Close()
ft.statsMutex.Lock()
ft.writeLag(ft.stats.FtSendLag)
ft.statsMutex.Unlock()

return ft.innerSender.Close()
}
Expand Down Expand Up @@ -477,6 +507,9 @@ func (ft *FtSender) saveToFile(datas []Data) error {
}

func (ft *FtSender) asyncSendLogFromQueue() {
// if not sleep, queue lag may be cleared
time.Sleep(time.Second * 10)

for i := 0; i < ft.procs; i++ {
if ft.opt.sendRaw {
readLinesChan := make(<-chan []string)
Expand All @@ -502,18 +535,31 @@ func (ft *FtSender) asyncSendLogFromQueue() {
}

// trySend 从bytes反序列化数据后尝试发送数据
func (ft *FtSender) trySendBytes(dat []byte, failSleep int, isRetry bool) (backDataContext []*datasContext, err error) {
func (ft *FtSender) trySendBytes(dat []byte, failSleep int, isRetry bool, isFromQueue bool) (backDataContext []*datasContext, err error) {
if ft.opt.sendRaw {
datas, err := ft.unmarshalRaws(dat)
if err != nil {
return nil, err
return nil, errors.New(KeyUnMarshalError + ":" + err.Error())
}
ft.statsMutex.Lock()
ft.stats.FtSendLag = ft.stats.FtSendLag - int64(len(datas))
if ft.stats.FtSendLag < 0 {
ft.stats.FtSendLag = 0
}
ft.statsMutex.Unlock()

return ft.backOffSendRawFromQueue(datas, failSleep, isRetry)
}
datas, err := ft.unmarshalData(dat)
if err != nil {
return nil, err
return nil, errors.New(KeyUnMarshalError + ":" + err.Error())
}
ft.statsMutex.Lock()
ft.stats.FtSendLag = ft.stats.FtSendLag - int64(len(datas))
if ft.stats.FtSendLag < 0 {
ft.stats.FtSendLag = 0
}
ft.statsMutex.Unlock()

return ft.backOffSendFromQueue(datas, failSleep, isRetry)
}
Expand Down Expand Up @@ -562,6 +608,9 @@ func (ft *FtSender) trySendRaws(datas []string, failSleep int, isRetry bool) (ba
log.Errorf("Runner[%v] Sender[%v] cannot write points back to queue %v: %v, discard datas %d", ft.runnerName, ft.innerSender.Name(), ft.BackupQueue.Name(), err, len(datas))
return nil, nil
}
ft.statsMutex.Lock()
ft.stats.FtSendLag += int64(len(v.Lines))
ft.statsMutex.Unlock()
}

time.Sleep(time.Second * time.Duration(math.Pow(2, float64(failSleep))))
Expand Down Expand Up @@ -616,6 +665,9 @@ func (ft *FtSender) trySendDatas(datas []Data, failSleep int, isRetry bool) (bac
log.Errorf("Runner[%v] Sender[%v] cannot write points back to queue %v: %v, discard datas %d", ft.runnerName, ft.innerSender.Name(), ft.BackupQueue.Name(), err, len(datas))
return nil, nil
}
ft.statsMutex.Lock()
ft.stats.FtSendLag += int64(len(v.Datas))
ft.statsMutex.Unlock()
}

time.Sleep(time.Second * time.Duration(math.Pow(2, float64(failSleep))))
Expand Down Expand Up @@ -876,6 +928,7 @@ func (ft *FtSender) sendRawFromQueue(queueName string, readChan <-chan []byte, r
timer := time.NewTicker(time.Second)
defer timer.Stop()
numWaits := 1
unmarshalDataError := 0
var curDataContext, otherDataContext []*datasContext
var curIdx int
var backDataContext []*datasContext
Expand All @@ -891,8 +944,14 @@ func (ft *FtSender) sendRawFromQueue(queueName string, readChan <-chan []byte, r
} else {
select {
case bytes := <-readChan:
backDataContext, err = ft.trySendBytes(bytes, numWaits, isRetry)
backDataContext, err = ft.trySendBytes(bytes, numWaits, isRetry, true)
case datas := <-readDatasChan:
ft.statsMutex.Lock()
ft.stats.FtSendLag = ft.stats.FtSendLag - int64(len(datas))
if ft.stats.FtSendLag < 0 {
ft.stats.FtSendLag = 0
}
ft.statsMutex.Unlock()
backDataContext, err = ft.backOffSendRawFromQueue(datas, numWaits, isRetry)
case <-timer.C:
continue
Expand All @@ -908,6 +967,15 @@ func (ft *FtSender) sendRawFromQueue(queueName string, readChan <-chan []byte, r
if numWaits > 5 {
numWaits = 5
}
if strings.HasPrefix(err.Error(), KeyUnMarshalError) {
unmarshalDataError++
if unmarshalDataError > NumUnMarshalError {
time.Sleep(time.Second)
log.Errorf("Runner[%s] Sender[%s] sleep 1s due to unmarshal err", ft.runnerName, ft.innerSender.Name(), queueName, err)
}
} else {
unmarshalDataError = 0
}
}
if backDataContext != nil {
otherDataContext = append(otherDataContext, backDataContext...)
Expand All @@ -924,6 +992,7 @@ func (ft *FtSender) sendFromQueue(queueName string, readChan <-chan []byte, read
timer := time.NewTicker(time.Second)
defer timer.Stop()
numWaits := 1
unmarshalDataError := 0
var curDataContext, otherDataContext []*datasContext
var curIdx int
var backDataContext []*datasContext
Expand All @@ -939,8 +1008,14 @@ func (ft *FtSender) sendFromQueue(queueName string, readChan <-chan []byte, read
} else {
select {
case bytes := <-readChan:
backDataContext, err = ft.trySendBytes(bytes, numWaits, isRetry)
backDataContext, err = ft.trySendBytes(bytes, numWaits, isRetry, true)
case datas := <-readDatasChan:
ft.statsMutex.Lock()
ft.stats.FtSendLag = ft.stats.FtSendLag - int64(len(datas))
if ft.stats.FtSendLag < 0 {
ft.stats.FtSendLag = 0
}
ft.statsMutex.Unlock()
backDataContext, err = ft.backOffSendFromQueue(datas, numWaits, isRetry)
case <-timer.C:
continue
Expand All @@ -956,6 +1031,15 @@ func (ft *FtSender) sendFromQueue(queueName string, readChan <-chan []byte, read
if numWaits > 5 {
numWaits = 5
}
if strings.HasPrefix(err.Error(), KeyUnMarshalError) {
unmarshalDataError++
if unmarshalDataError > NumUnMarshalError {
time.Sleep(time.Second)
log.Errorf("Runner[%s] Sender[%s] sleep 1s due to unmarshal err", ft.runnerName, ft.innerSender.Name(), queueName, err)
}
} else {
unmarshalDataError = 0
}
}
if backDataContext != nil {
otherDataContext = append(otherDataContext, backDataContext...)
Expand Down Expand Up @@ -993,8 +1077,8 @@ func SplitData(data string) (valArray []string) {
valArray = SplitDataWithSplitSize(valArray, data[start:offset], DefaultSplitSize)
if len(valArray) > 0 {
// 最后一个分片参与下次split
start = offset - len(valArray[len(valArray) - 1])
valArray = valArray[:len(valArray) - 1]
start = offset - len(valArray[len(valArray)-1])
valArray = valArray[:len(valArray)-1]
}
continue
}
Expand All @@ -1017,7 +1101,7 @@ func SplitDataWithSplitSize(originArray []string, data string, splitSize int64)
if len(originArray) != 0 {
num := (DefaultMaxBatchSize - int64(len(originArray[len(originArray)-1]))) / splitSize
if num > 0 {
end := num*splitSize
end := num * splitSize
if end > int64(len(data)) {
end = int64(len(data))
}
Expand Down Expand Up @@ -1200,3 +1284,34 @@ func (ft *FtSender) backOffReTrySendRaw(lines []string, isRetry bool) (res []*da
time.Sleep(backoff.Duration())
}
}

// readLag read lag from file
func (ft *FtSender) readLag() int64 {
path := filepath.Join(ft.opt.saveLogPath, LagFilename)
f, err := ioutil.ReadFile(path)
if err != nil {
log.Errorf("Runner[%v] Sender[%v] read file error : %v", ft.runnerName, ft.innerSender.Name(), err)
return 0
}
lag, err := strconv.ParseInt(string(f), 10, 64)
if err != nil {
log.Errorf("Runner[%v] Sender[%v] parse lag error : %v", ft.runnerName, ft.innerSender.Name(), err)
}
return lag
}

// writeLag write lag into file
func (ft *FtSender) writeLag(lag int64) error {
path := filepath.Join(ft.opt.saveLogPath, LagFilename)
file, err := os.OpenFile(path, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0666)
defer func() {
file.Sync()
file.Close()
}()
if err != nil {
return err
}
lagStr := strconv.FormatInt(lag, 10)
_, err = file.WriteString(lagStr)
return err
}
Loading

0 comments on commit 1fb65f3

Please sign in to comment.