leuschnm commited on
Commit
af01b55
·
1 Parent(s): 994ce72

add fixes to input tensor

Browse files
Files changed (5) hide show
  1. IMG_1.jpg +0 -0
  2. IMG_2.jpg +0 -0
  3. IMG_3.jpg +0 -0
  4. app.py +71 -35
  5. model.py +1 -1
IMG_1.jpg CHANGED
IMG_2.jpg CHANGED
IMG_3.jpg CHANGED
app.py CHANGED
@@ -15,70 +15,106 @@
15
  import os
16
  import numpy as np
17
  import torch
18
- from model import SASNet
19
  import warnings
20
  import random
21
  import matplotlib.pyplot as plt
22
  import gradio as gr
 
 
 
23
 
24
  warnings.filterwarnings('ignore')
25
 
26
  # define the GPU id to be used
27
  os.environ['CUDA_VISIBLE_DEVICES'] = '0'
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def predict(img):
30
  """the main process of inference"""
31
  test_loader = loading_data(img)
32
-
33
- model = SASNet(batch_size=4, log_para=1000, block_size=32).cuda()
34
- model_path = "SHHA.pth"
35
  # load the trained model
36
  model.load_state_dict(torch.load(model_path))
37
  print('successfully load model from', model_path)
38
 
39
  with torch.no_grad():
40
  model.eval()
41
-
42
- img = img.convert('RGB')
43
- transform = standard_transforms.Compose([
44
- standard_transforms.ToTensor(), standard_transforms.Normalize(mean=[0.485, 0.456, 0.406],
45
- std=[0.229, 0.224, 0.225]),])
46
- img = transform(img)
47
- img = torch.Tensor(img)
48
-
49
- img = img.cuda()
50
- pred_map = model(img)
51
-
52
- pred_map = pred_map.data.cpu().numpy()
53
- pred_cnt = np.sum(pred_map[i_img]) / 1000
54
-
55
- den_map = np.squeeze(pred_map[i_img])
56
- fig = plt.figure(frameon=False)
57
- ax = plt.Axes(fig, [0., 0., 1., 1.])
58
- ax.set_axis_off()
59
- fig.add_axes(ax)
60
- ax.imshow(den_map, aspect='auto')
61
- return (pred_cnt, fig)
62
 
63
  with gr.Blocks() as demo:
64
  gr.Markdown("""
65
  # Crowd Counting based on SASNet
66
 
67
- We implemented a image crowd counting model with VGG16 following the paper of Song et. al (2021).
 
 
 
 
 
68
 
69
  ## References
70
  Song, Q., Wang, C., Wang, Y., Tai, Y., Wang, C., Li, J., … Ma, J. (2021). To Choose or to Fuse? Scale Selection for Crowd Counting. The Thirty-Fifth AAAI Conference on Artificial Intelligence (AAAI-21).
71
  """)
72
  image_button = gr.Button("Count the Crowd!")
73
  with gr.Row():
74
- with gr.Column():
75
- image_input = gr.Image(type="pil")
76
- gr.Examples(["IMG_1.jpg", "IMG_2.jpg", "IMG_3.jpg"], image_input)
77
- with gr.Column():
78
- text_output = gr.Label()
79
- image_output = gr.Plot()
80
-
81
-
 
82
  image_button.click(predict, inputs=image_input, outputs=[text_output, image_output])
83
 
84
- demo.launch()
 
 
15
  import os
16
  import numpy as np
17
  import torch
 
18
  import warnings
19
  import random
20
  import matplotlib.pyplot as plt
21
  import gradio as gr
22
+ import torchvision.transforms as standard_transforms
23
+ from torch.utils.data import DataLoader
24
+ from torch.utils.data import Dataset
25
 
26
  warnings.filterwarnings('ignore')
27
 
28
  # define the GPU id to be used
29
  os.environ['CUDA_VISIBLE_DEVICES'] = '0'
30
 
31
class data(Dataset):
    """Dataset that serves one already-loaded image as its only sample.

    Args:
        img: Image object exposing ``convert(mode)`` (the app passes the PIL
            image produced by the Gradio input component).
        transform: Optional callable applied to the RGB-converted image.
    """

    def __init__(self, img, transform=None):
        self.image = img
        self.transform = transform

    def __len__(self):
        # Exactly one image is stored, so the dataset has exactly one sample.
        # Reporting a larger length would make a DataLoader emit the same
        # image over and over.
        return 1

    def __getitem__(self, x):
        """Return the stored image as a tensor.

        Args:
            x: Sample index; only ``0`` (or ``-1``) is valid.

        Returns:
            The RGB-converted image after ``transform`` (if any), as a
            ``torch.Tensor``.

        Raises:
            IndexError: If ``x`` is outside the single-sample range.
        """
        if not -len(self) <= x < len(self):
            raise IndexError(f"index {x} out of range for a single-image dataset")

        image = self.image.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)

        # The transform normally yields a tensor already; only convert when
        # it did not, instead of re-wrapping an existing tensor.
        if not isinstance(image, torch.Tensor):
            image = torch.Tensor(image)
        return image
48
+
49
def loading_data(img):
    """Build a ``DataLoader`` that yields the given image as a normalized batch.

    Args:
        img: The input image; it is handed unchanged to the ``data`` dataset,
            which converts it to RGB before the transform runs.

    Returns:
        A ``DataLoader`` over a ``data`` dataset with ``batch_size=1`` and no
        shuffling.
    """
    # Preprocessing: convert to a tensor, then normalize each channel.
    # NOTE(review): these mean/std values are the usual ImageNet statistics —
    # confirm they match what the checkpoint was trained with.
    transform = standard_transforms.Compose([
        standard_transforms.ToTensor(),
        standard_transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                      std=[0.229, 0.224, 0.225]),
    ])

    # Create the dataset.
    test_set = data(img=img, transform=transform)

    # Load in the main process: the image is already in memory, so worker
    # processes would only add start-up cost.
    test_loader = DataLoader(test_set, batch_size=1, num_workers=0, shuffle=False, drop_last=False)

    return test_loader
60
+
61
+
62
# Loaded networks keyed by (checkpoint path, device string), so the weights
# are read from disk once rather than on every request.
_MODEL_CACHE = {}


def _get_model(model_path, device):
    """Return the SASNet for ``model_path`` on ``device``, loading it on first use."""
    key = (model_path, str(device))
    if key not in _MODEL_CACHE:
        # Imported locally because no module-level import of SASNet is
        # visible in this revision of the file.
        from model import SASNet

        model = SASNet().to(device)
        # map_location lets a checkpoint saved on GPU load on a CPU-only host.
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.eval()
        print('successfully load model from', model_path)
        _MODEL_CACHE[key] = model
    return _MODEL_CACHE[key]


def predict(img):
    """Run crowd-count inference on one image.

    Args:
        img: The image supplied by the Gradio input component.

    Returns:
        A ``(pred_cnt, fig)`` tuple: the predicted count as a Python float and
        a matplotlib figure showing the predicted map.

    Raises:
        ValueError: If no image was supplied or the loader yields no batch.
    """
    if img is None:
        raise ValueError("no input image was provided")

    test_loader = loading_data(img)
    # Fall back to the CPU when CUDA is not available instead of failing.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = _get_model("./SHHA.pth", device)

    with torch.no_grad():
        for batch in test_loader:
            pred_map = model(batch.to(device)).cpu().numpy()

            # The loader uses batch_size=1, so the first entry is the image.
            first_map = pred_map[0]
            # NOTE(review): 1000 presumably undoes the density scaling used in
            # training (the previous revision passed log_para=1000) — confirm.
            # Converted to a built-in float so the UI receives a plain number
            # rather than a NumPy scalar.
            pred_cnt = float(np.sum(first_map) / 1000)

            # Draw the map on a borderless axes that fills the whole figure.
            fig = plt.figure(frameon=False)
            ax = plt.Axes(fig, [0., 0., 1., 1.])
            ax.set_axis_off()
            fig.add_axes(ax)
            ax.imshow(np.squeeze(first_map), aspect='auto')

            return pred_cnt, fig

    raise ValueError("the data loader produced no batches")
90
+
 
 
 
91
 
92
  with gr.Blocks() as demo:
93
  gr.Markdown("""
94
  # Crowd Counting based on SASNet
95
 
96
+ We implemented an image crowd counting model with VGG16 following the paper of Song et al. (2021).
97
+
98
+ ## Abstract
99
+ In this paper, we address the large scale variation problem in crowd counting by taking full advantage of the multi-scale feature representations in a multi-level network. We implement such an idea by keeping the counting error of a patch as small as possible with a proper feature level selection strategy, since a specific feature level tends to perform better for a certain range of scales. However, without scale annotations, it is sub-optimal and error-prone to manually assign the predictions for heads of different scales to specific feature levels. Therefore, we propose a Scale-Adaptive Selection Network (SASNet), which automatically learns the internal correspondence between the scales and the feature levels. Instead of directly using the predictions from the most appropriate feature level as the final estimation, our SASNet also considers the predictions from other feature levels via weighted average, which helps to mitigate the gap between discrete feature levels and continuous scale variation. Since the heads in a local patch share roughly a same scale, we conduct the adaptive selection strategy in a patch-wise style. However, pixels within a patch contribute different counting errors due to the various difficulty degrees of learning. Thus, we further propose a Pyramid Region Awareness Loss (PRA Loss) to recursively select the most hard sub-regions within a patch until reaching the pixel level. With awareness of whether the parent patch is over-estimated or under-estimated, the fine-grained optimization with the PRA Loss for these region-aware hard pixels helps to alleviate the inconsistency problem between training target and evaluation metric. The state-of-the-art results on four datasets demonstrate the superiority of our approach.
100
+
101
+ The code will be available at: https://github.com/TencentYoutuResearch/CrowdCounting-SASNet.
102
 
103
  ## References
104
  Song, Q., Wang, C., Wang, Y., Tai, Y., Wang, C., Li, J., … Ma, J. (2021). To Choose or to Fuse? Scale Selection for Crowd Counting. The Thirty-Fifth AAAI Conference on Artificial Intelligence (AAAI-21).
105
  """)
106
  image_button = gr.Button("Count the Crowd!")
107
  with gr.Row():
108
+ with gr.Column():
109
+ image_input = gr.Image(type="pil")
110
+ gr.Examples(["IMG_1.jpg", "IMG_2.jpg", "IMG_3.jpg"], image_input)
111
+ with gr.Column():
112
+ image_output = gr.Plot()
113
+ with gr.Column():
114
+ text_output = gr.Label()
115
+
116
+
117
  image_button.click(predict, inputs=image_input, outputs=[text_output, image_output])
118
 
119
+ demo.launch(debug = True)
120
+
model.py CHANGED
@@ -133,7 +133,7 @@ class SASNet(nn.Module):
133
  Conv2d(32, 1, 1, same_padding=True, NL=None)
134
  )
135
 
136
- self.block_size = args.block_size
137
  # the forward process
138
  def forward(self, x):
139
  size = x.size()
 
133
  Conv2d(32, 1, 1, same_padding=True, NL=None)
134
  )
135
 
136
+ self.block_size = 32
137
  # the forward process
138
  def forward(self, x):
139
  size = x.size()