-
Notifications
You must be signed in to change notification settings - Fork 15
/
data-clean_glitch.py
118 lines (111 loc) · 4.51 KB
/
data-clean_glitch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os, sys, glob
sys.path.append('/home/zhouyj/software/data_prep')
import numpy as np
from obspy import read, UTCDateTime
from signal_lib import preprocess
import warnings
warnings.filterwarnings("ignore")
# i/o paths
data_dirs = sorted(glob.glob('/data/Example_data/*'))
bak_dir = '/data/Example_data_bak'
fout = open('output/eg_data-glitch.csv','w')
# data prep
freq_band = [1,20]
win_sta, win_lta = 0.5, 1
min_snr = 100
min_gap_npts = 10
num_glitch_bak = 100
glitch_wid = 1. # sec
to_write = 0 # 0 - not to write; 1 - write >num_glitch_bak; 2 - write all
def calc_sta_lta(data, win_lta_npts, win_sta_npts):
npts = len(data)
if npts < win_lta_npts + win_sta_npts:
print('input data too short!')
return np.zeros(1)
sta = np.zeros(npts)
lta = np.ones(npts)
data_cum = np.cumsum(data)
sta[:-win_sta_npts] = data_cum[win_sta_npts:] - data_cum[:-win_sta_npts]
sta /= win_sta_npts
lta[win_lta_npts:] = data_cum[win_lta_npts:] - data_cum[:-win_lta_npts]
lta /= win_lta_npts
sta_lta = sta/lta
sta_lta[0:win_lta_npts] = 0.
sta_lta[np.isinf(sta_lta)] = 0.
sta_lta[np.isnan(sta_lta)] = 0.
return sta_lta
def remove_glitch(st):
# data prep
data_raw = st[0].data.copy()
start_time = st[0].stats.starttime
samp_rate = st[0].stats.sampling_rate
win_sta_npts = int(samp_rate*win_sta)
win_lta_npts = int(samp_rate*win_lta)
glitch_wid_npts = int(samp_rate*glitch_wid)
st = preprocess(st, samp_rate, freq_band)
data_2 = st[0].data**2
# forward & backward STA/LTA
sta_lta_0 = calc_sta_lta(data_2, win_lta_npts, win_sta_npts)
sta_lta_1 = calc_sta_lta(data_2[::-1], win_lta_npts, win_sta_npts)[::-1]
gap_idx = np.where(sta_lta_0>=min_snr)[0]
large_back = np.array(sta_lta_1>=min_snr, dtype=np.int32)
gap_list = np.split(gap_idx, np.where(np.diff(gap_idx)!=1)[0] + 1)
gap_list = [gap for gap in gap_list if len(gap)>0]
glitch_times = []
for gap in gap_list:
if sum(large_back[gap[-1]:gap[-1]+glitch_wid_npts])==0: continue
ref_idx = gap[-1] + glitch_wid_npts
idx0 = gap[0]
if sum(large_back[ref_idx:ref_idx+2*glitch_wid_npts]==0)==0: continue
idx1 = ref_idx + np.where(large_back[ref_idx:ref_idx+2*glitch_wid_npts]==0)[0][0]
t0, t1 = start_time + idx0/samp_rate, start_time + idx1/samp_rate
glitch_times.append([t0, t1])
# interploate fill
delta = (data_raw[idx1] - data_raw[idx0]) / (idx1-idx0)
interp_fill = np.array([data_raw[idx0] + ii*delta for ii in range(idx1-idx0)])
data_raw[idx0:idx1] = interp_fill
st[0].data = data_raw
return st, glitch_times
def remove_gap(st):
data = st[0].data
npts = len(data)
gap_idx = np.where(data==0)[0]
gap_list = np.split(gap_idx, np.where(np.diff(gap_idx)!=1)[0] + 1)
gap_list = [gap for gap in gap_list if len(gap)>=min_gap_npts]
num_gap = len(gap_list)
for ii,gap in enumerate(gap_list):
idx0, idx1 = max(0, gap[0]-1), min(npts-1, gap[-1]+1)
if ii<num_gap-1: idx2 = min(idx1+(idx1-idx0), gap_list[ii+1][0])
else: idx2 = min(idx1+(idx1-idx0), npts-1)
if idx1==idx2: continue
if idx2==idx1+(idx1-idx0): data[idx0:idx1] = data[idx1:idx2]
else:
num_tile = int(np.ceil((idx1-idx0)/(idx2-idx1)))
data[idx0:idx1] = np.tile(data[idx1:idx2], num_tile)[0:idx1-idx0] st[0].data = data
st[0].data = data
return st
for data_dir in data_dirs:
print('cleaning', data_dir)
stz_paths = sorted(glob.glob(os.path.join(data_dir,'*Z.%s'%data_format)))
for stz_path in stz_paths:
stz_dir, fname = os.path.split(stz_path)
net, sta = fname.split('.')[1:3]
st_paths = sorted(glob.glob('%s/*.%s.%s.*'%(stz_dir,net,sta)))
is_bad, st_raw = False, []
for st_path in st_paths:
try: st = read(st_path)
except: print('error in reading file', st_path); continue
bak_path = os.path.join(bak_dir, os.path.basename(st_path))
st_raw.append([st.copy(), bak_path])
st = remove_gap(st)
st, glitch_times = remove_glitch(st)
num_glitch = len(glitch_times)
for [t0,t1] in glitch_times: fout.write('%s.%s,%s,%s\n'%(net,sta,t0,t1))
if num_glitch>=num_glitch_bak:
is_bad = True
if to_write==1: st.write(st_path)
elif num_glitch>0:
if to_write==2: st.write(st_path)
if is_bad:
for st, bak_path in st_raw: st.write(bak_path)
fout.close()