Safemotion Lib
Loading...
Searching...
No Matches
cmdm.py
Go to the documentation of this file.
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3# @Date : 2020-10-13 14:43:58
4# @Author : Dengpan Fu (t-defu@microsoft.com)
5
6import os
7import os.path as osp
8import re, tarfile, shutil
9import warnings
10import numpy as np
11from glob import glob
12
13from .bases import ImageDataset
14from ..datasets import DATASET_REGISTRY
15
# Maps user-facing dataset aliases onto the on-disk folder layout under the
# data root. Values are tuples so the cuhk03 entries can carry an extra
# ('labeled' / 'detected') sub-folder component.
NAME_DICT = {
    'msmt': ('msmt17', ),
    'msmt17': ('msmt17', ),
    'duke': ('duke', ),
    'dukemtmc': ('duke', ),
    'market': ('market1501', ),
    'market1501': ('market1501', ),
    'cuhk': ('cuhk03_np', 'labeled'),
    'cuhk03': ('cuhk03_np', 'labeled'),
    'cuhk-lab': ('cuhk03_np', 'labeled'),
    'cuhk03-lab': ('cuhk03_np','labeled'),
    'cuhk-det': ('cuhk03_np', 'detected'),
    'cuhk03-det': ('cuhk03_np', 'detected'),
}
# Glob patterns for the image extensions scanned when collecting samples.
EXTS = ['*.jpg', '*.png', '*.jpeg', '*.bmp', '*.ppm']
31
32@DATASET_REGISTRY.register()
34 """ Dataset class for CUHK03-NP(C), Market1501(M), Duke(D), MSMT(M)
35 with specical attributes.
36 """
37 def __init__(self, root='datasets', data_name='duke', split_mode='ori',
38 split_ratio=1.0, repeat_ratio=1.0, tgz_data=None, **kwargs):
39 # self.root = osp.abspath(osp.expanduser(root))
40 self.base_root = root
41 self.data_name = data_name
42 self.split_mode = split_mode
43 self.split_ratio = split_ratio
44 self.repeat_ratio = repeat_ratio
45 self.tgz_data = tgz_data
46 self.dataset_dir = osp.join(self.base_root, *NAME_DICT[self.data_name])
47
48 self.train_dir = osp.join(self.dataset_dir, 'bounding_box_train')
49 self.query_dir = osp.join(self.dataset_dir, 'query')
50 self.gallery_dir = osp.join(self.dataset_dir, 'bounding_box_test')
51
52 self.used_id_files = os.path.join(self.dataset_dir, 'few_ids',
53 f'used_ids_{self.split_ratio:.02f}.txt')
54 self.used_im_files = os.path.join(self.dataset_dir, 'few_ims',
55 f'used_ims_{self.split_ratio:.02f}.txt')
56
57 self.prepare_data()
58
59 required_files = [
60 self.dataset_dir,
61 self.train_dir,
62 self.query_dir,
63 self.gallery_dir,
64 ]
65 self.check_before_run(required_files)
66
67 train = self.preprocess(self.train_dir, is_train=True)
68 query = self.preprocess(self.query_dir, relabel=False)
69 gallery = self.preprocess(self.gallery_dir, relabel=False)
70
71 super(CMDM, self).__init__(train, query, gallery, **kwargs)
72
73
75 has_train_dir = os.path.isdir(self.train_dir)
76 has_query_dir = os.path.isdir(self.query_dir)
77 has_gallery_dir = os.path.isdir(self.gallery_dir)
78 has_used_im_file = os.path.isfile(self.used_im_files)
79 has_used_id_file = os.path.isfile(self.used_id_files)
80 valid = (has_train_dir & has_gallery_dir & has_query_dir &
81 has_used_id_file & has_used_im_file)
82 return valid
83
84 def prepare_data(self):
85 if self.check_data_folder():
86 print(f"Data[{self.data_name}] is prepared at {self.dataset_dir}")
87 return True
88 else:
89 print(f'Preparing [{self.data_name}] ...')
90 if self.tgz_data is None or not os.path.exists(self.tgz_data):
91 print(f"No tgz[{self.tgz_data}] data provided for data preparing")
92 raise IOError(f'No vaild tgz[{self.tgz_data}] data provided')
93 if not os.path.exists(self.base_root):
94 os.makedirs(self.base_root)
95 tgz_dst = os.path.join(self.base_root, os.path.basename(self.tgz_data))
96 if not os.path.isfile(tgz_dst):
97 print(f"Coping {self.tgz_data} to {tgz_dst}")
98 shutil.copy(self.tgz_data, tgz_dst)
99 with tarfile.open(tgz_dst) as tar:
100 print(f"Extracting {tgz_dst}")
101 tar.extractall(self.base_root)
102 return True
103
104 def preprocess(self, path, relabel=True, is_train=False):
105 if is_train and self.split_mode == 'id':
106 return self.process_train_id_mode(path)
107 if is_train and self.split_mode == 'im':
108 return self.process_train_im_mode(path)
109
110 pattern = re.compile(r'([-\d]+)_c(\d+)')
111 all_pids, all_cids = {}, {}
112 ret, fpaths = [], []
113 for ext in EXTS:
114 fpaths.extend(glob(os.path.join(path, ext)))
115 fpaths = sorted(fpaths)
116 for fpath in fpaths:
117 fname = os.path.basename(fpath)
118 pid, cid = map(int, pattern.search(fname).groups())
119 if pid == -1: continue
120 if relabel:
121 if pid not in all_pids:
122 all_pids[pid] = len(all_pids)
123 else:
124 if pid not in all_pids:
125 all_pids[pid] = pid
126 if cid not in all_cids:
127 all_cids[cid] = cid
128 pid = all_pids[pid]
129 cid -= 1
130 ret.append((fpath, pid, cid))
131 return ret
132
133 def process_train_id_mode(self, path, relabel=True):
134 pattern = re.compile(r'([-\d]+)_c(\d+)')
135 fpaths = []
136 for ext in EXTS:
137 fpaths.extend(glob(os.path.join(path, ext)))
138 fpaths = sorted(fpaths)
139
140 pid_container = set()
141 for fpath in fpaths:
142 pid, _ = map(int, pattern.search(fpath).groups())
143 if pid == -1: continue # junk images are just ignored
144 pid_container.add(pid)
145
146 ids = []
147 if os.path.isfile(self.used_id_files):
148 with open(self.used_id_files, 'r') as f:
149 lines = f.readlines()
150 ids = [int(line.strip()) for line in lines if line.strip()]
151 if not len(ids) == int(len(pid_container) * self.split_ratio):
152 ids = []
153 else:
154 print(f"Loading split info with [mode={self.split_mode}]"
155 f" from {self.used_id_files}")
156 if len(ids) < 1:
157 num = int(len(pid_container) * self.split_ratio)
158 choose_ids = np.random.choice(list(pid_container), num, replace=False)
159 ids = sorted(choose_ids)
160 with open(self.used_id_files, 'w') as f:
161 for iidd in ids:
162 f.write(f'{iidd:d} \n')
163 print(f"Saving split info to {self.used_id_files}")
164
165 pid2label = {pid: label for label, pid in enumerate(ids)}
166 all_pids, all_cids = [], []
167
168 dataset = []
169 for fpath in fpaths:
170 pid, camid = map(int, pattern.search(fpath).groups())
171 if pid == -1: continue # junk images are just ignored
172 camid -= 1 # index starts from 0
173 if not pid in ids: continue
174 if relabel: pid = pid2label[pid]
175 if not pid in all_pids: all_pids.append(pid)
176 if not camid in all_cids: all_cids.append(camid)
177 dataset.append((fpath, pid, camid))
178
179 return sorted(dataset)
180
181 def process_train_im_mode(self, path, relabel=True):
182 pattern = re.compile(r'([-\d]+)_c(\d+)')
183 fpaths = []
184 for ext in EXTS:
185 fpaths.extend(glob(os.path.join(path, ext)))
186 fpaths = sorted(fpaths)
187
188 dataset = []
189 if os.path.isfile(self.used_im_files):
190 with open(self.used_im_files, 'r') as f:
191 lines = f.readlines()
192 if len(lines) > 2:
193 for line in lines:
194 if line:
195 name, pid, camid = line.strip().split('; ')
196 dataset.append([name, int(pid), int(camid)])
197 print(f"Loading split data with [mode={self.split_mode}]"
198 f" from {self.used_im_files}")
199 if len(dataset) < 2:
200 dataset = []
201 pid_dict = {}
202 for fpath in fpaths:
203 pid, _ = map(int, pattern.search(fpath).groups())
204 if pid == -1: continue # junk images are just ignored
205 if pid in pid_dict:
206 pid_dict[pid].append(os.path.basename(fpath))
207 else:
208 pid_dict[pid] = [os.path.basename(fpath)]
209 pid2label = {pid: label for label, pid in enumerate(sorted(pid_dict.keys()))}
210 for key, value in pid_dict.items():
211 num = int(max(np.round(len(value) * self.split_ratio), 1))
212 chooses = np.random.choice(value, num, replace=False)
213 pid = key
214 if relabel: pid = pid2label[pid]
215 for choose in chooses:
216 name = str(choose)
217 _, camid = map(int, pattern.search(name).groups())
218 dataset.append([name, pid, camid -1])
219 with open(self.used_im_files, 'w') as f:
220 for item in dataset:
221 f.write(f"{item[0]:s}; {item[1]:d}; {item[2]:d} \n")
222 print(f"Saving split info to {self.used_im_files}")
223
224 all_pids, all_cids = [], []
225 for item in dataset:
226 item[0] = os.path.join(path, item[0])
227 if not item[1] in all_pids: all_pids.append(item[1])
228 if not item[2] in all_cids: all_cids.append(item[2])
229
230 nd = []
231 if self.repeat_ratio > 1:
232 nd = dataset * int(self.repeat_ratio)
233 remain = self.repeat_ratio - int(self.repeat_ratio)
234 if remain > 0:
235 end = int(remain * len(dataset))
236 nd.extend(dataset[:end])
237 dataset = nd
238
239 return sorted(dataset)
check_before_run(self, required_files)
Definition bases.py:113
process_train_id_mode(self, path, relabel=True)
Definition cmdm.py:133
preprocess(self, path, relabel=True, is_train=False)
Definition cmdm.py:104
process_train_im_mode(self, path, relabel=True)
Definition cmdm.py:181
__init__(self, root='datasets', data_name='duke', split_mode='ori', split_ratio=1.0, repeat_ratio=1.0, tgz_data=None, **kwargs)
Definition cmdm.py:38