Sebastien De Greef committed
Commit d5c3317
Parent(s): 7d64a6c

Create index.html for AI portfolio and add projects.json

Add object detection page with YOLO implementation
Add support for PNG images in Git LFS
- .gitattributes +1 -0
- images/object-detection-image1.PNG +3 -0
- images/object-detection-segmentation-1.PNG +3 -0
- images/object-detection-segmentation-2.PNG +3 -0
- images/object-detection-segmentation-3.PNG +3 -0
- images/object-detection-training-results.png +3 -0
- index.html +44 -10
- linkedin.png +3 -0
- object-detection.html +71 -0
- object-detection.md +248 -0
- projects.json +11 -0
- style.css +172 -18
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
images/object-detection-image1.PNG
ADDED (Git LFS)
images/object-detection-segmentation-1.PNG
ADDED (Git LFS)
images/object-detection-segmentation-2.PNG
ADDED (Git LFS)
images/object-detection-segmentation-3.PNG
ADDED (Git LFS)
images/object-detection-training-results.png
ADDED (Git LFS)
index.html
CHANGED
@@ -3,17 +3,51 @@
   <head>
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width" />
-    <title>
+    <title>AI Portfolio on Huggingface</title>
     <link rel="stylesheet" href="style.css" />
+    <script>
+      document.addEventListener("DOMContentLoaded", function () {
+        fetch("projects.json")
+          .then((response) => response.json())
+          .then((data) => {
+            const projects = data;
+            const projectsContainer = document.getElementById("projects");
+            projects.forEach((project) => {
+              const projectCard = document.createElement("section");
+              projectCard.classList.add("project-card");
+              const title = document.createElement("h2");
+              title.textContent = project.title;
+              const description = document.createElement("p");
+              description.textContent = project.description;
+              const link = document.createElement("a");
+              link.href = project.link;
+              link.target = "_blank";
+              link.textContent = "View Project";
+
+              projectCard.appendChild(title);
+              projectCard.appendChild(description);
+              projectCard.appendChild(link);
+
+              projectsContainer.appendChild(projectCard);
+            });
+          });
+      });
+    </script>
   </head>
   <body>
-    <
-    <h1>Welcome to
-    <p>
-
-
-
-
-
+    <header>
+      <h1>Welcome to My AI Portfolio</h1>
+      <p>Discover my projects and experiments with Artificial Intelligence on Huggingface.</p>
+    </header>
+    <article id="projects">
+    </article>
+    <footer>
+      <a href="https://www.linkedin.com/in/sebdg/" target="_blank">
+        <img src="linkedin.png" width="24" alt="LinkedIn Icon" /> My LinkedIn Profile
+      </a>
+    </footer>
   </body>
-</html>
+</html>
linkedin.png
ADDED (Git LFS)
object-detection.html
ADDED
@@ -0,0 +1,71 @@
<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width" />
    <title>Object Detection with YOLO</title>
    <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
    <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
    <link rel="stylesheet" href="style.css" />
    <link
      rel="stylesheet"
      href="https://cdn.jsdelivr.net/gh/highlightjs/cdn-release/build/styles/default.min.css"
    />
    <script src="https://cdn.jsdelivr.net/gh/highlightjs/cdn-release/build/highlight.min.js"></script>
    <script>
      class BC {
        constructor(elementId) {
          this.container = document.getElementById(elementId);
          this.headings = document.querySelectorAll("h1, h2, h3, h4");
          this.currentHeading = null;
        }
        set_breadcrumb() {
          const headings = document.querySelectorAll("h1, h2, h3, h4"); // Select all heading elements
          let currentHeading = null;

          // Iterate through headings to see which is currently viewable
          for (let i = 0; i < headings.length; i++) {
            const heading = headings[i];
            if (
              heading.getBoundingClientRect().top <
              window.innerHeight * 0.1
            ) {
              // Heading is at the top of the page
              currentHeading = heading;
            } else {
              break; // Once a heading below the top is found, stop the search
            }
          }

          // Update the breadcrumb div with the current heading information
          const breadcrumb = document.getElementById("breadcrumb");
          if (currentHeading) {
            breadcrumb.textContent = currentHeading.textContent; // Set text or build a more complex breadcrumb
          }
        }
      }
      console.log(marked);
      document.addEventListener("DOMContentLoaded", function () {
        fetch("object-detection.md")
          .then((response) => response.text())
          .then((text) => {
            const html = marked.marked(text);
            document.getElementById("markdown-container").innerHTML = html;
            document.querySelectorAll("pre code").forEach((block) => {
              hljs.highlightBlock(block);
            });
            const bc = new BC("markdown-container");
            bc.set_breadcrumb();
            document.addEventListener("scroll", bc.set_breadcrumb);
          })
          .catch((error) =>
            console.error("Error loading the Markdown file:", error)
          );
      });
    </script>
  </head>
  <body>
    <div id="breadcrumb"></div>
    <div id="markdown-container"></div>
  </body>
</html>
object-detection.md
ADDED
@@ -0,0 +1,248 @@
# Object Detection with YOLO

## Introduction

In the realm of AI, the rapid advancement of computer vision has paved the way for innovations that could significantly enhance automated systems and improve public safety, especially in public transportation.

Among the many object detection systems and models, [YOLO](https://docs.ultralytics.com/) (You Only Look Once) stands out for its ability to detect objects in real time with remarkable accuracy and low computing requirements. The small size of the models makes them well suited for deployment on on-board equipment with limited resources, while remaining very performant and a good fit for dynamic environments where rapid decision-making is crucial, such as public transportation and autonomous driving.

This experiment, prompted by a job opening I was applying to, delves into the application of the YOLO object detection model to cab ride videos from trains and trams. These videos, captured directly from the front-facing cameras of public transit vehicles and posted on YouTube, offer a rich dataset that reflects the diverse and unpredictable urban environment through which these vehicles travel.

The primary goal of utilizing such a model in this context is to enhance transportation safety and operational efficiency by identifying potential hazards and improving route management based on real-world data. Other applications could include line optimization, passenger counting, track condition analysis and, of course, fully autonomous driving.

While exploring the process of training such a YOLO model on these specific videos, I wanted to document this experiment to highlight the challenges faced while adapting the model to the peculiarities of railway and tram systems. By keeping the dataset and effort rather limited I want to demonstrate my amazing skills, humm... lol, not really, but rather how such complex tasks can be accomplished with a limited set of human annotations and interventions, using foundational models to train very specialized ones.

## Methodology

The methodology used to train the vision model on cab ride videos encompasses several critical steps, from data collection to model training and validation.
Each step is vital, and its quality will influence the model's accuracy and functionality in real-world scenarios.
This is _just_ an experiment; a real-world model would require more data, quality assurance, and intermediate or task-specific models for sub-tasks.

### Data Collection

The primary data for this project consists of YouTube cab ride videos recorded in trains and trams. These videos are typically captured through front-facing cameras mounted on the vehicles, providing a driver's-eye view of the route. The footage includes diverse scenes from urban and rural settings under various weather and lighting conditions, making it a good source of input data.

### Data Characteristics

The videos are characterized by high-resolution imagery that captures details necessary for accurate object detection, such as obstacles on tracks, signals, and other vehicles. The collection spans a couple of hours of footage, ensuring a comprehensive dataset that includes a wide range of scenarios and anomalies. These videos are often very lengthy and include stops, tunnels, or stretches of very similar frames. To select which video segments are interesting and offer a variety of situations, the first step is to split them into segments.

```python
import os
import math

from moviepy.editor import VideoFileClip


def split_video(video_path, output_folder, segment_length=120):
    """Split a video file into segments of a fixed length and save them as separate files."""
    # Load the video
    video = VideoFileClip(video_path)
    duration = video.duration

    # Create the output folder
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Calculate the number of segments needed
    num_segments = math.ceil(duration / segment_length)
    print(f"Splitting the video into {num_segments} segments.")
    # Loop through all segments
    for i in range(num_segments):
        # Calculate start and end times
        start_time = i * segment_length
        end_time = min((i + 1) * segment_length, duration)

        # Cut the segment
        segment = video.subclip(start_time, end_time)

        video_name = os.path.basename(video_path)

        # Define the output file name
        output_filename = f"{output_folder}/{video_name[:-4]}_{i+1}.mp4"

        # Write the segment to a file
        segment.write_videofile(output_filename, codec='libx264', audio_codec='aac')

        print(f"Segment {i+1} is done.")
```

### Preparation and Preprocessing

Now that we have a set of two-minute videos, let's focus only on those with relevant content. A couple of techniques for identifying frame similarity came to mind, but they would not highlight sequences containing different objects or situations. So I quickly shuffled through the videos manually and kept about 15 minutes of footage to analyse out of the initial 7 hours, covering urban and rural settings, trams and trains, with or without other vehicles or pedestrians.

#### Frame extraction

Due to the continuous nature of video files, the next step involves extracting _some_ frames at a fixed interval (stride). This
process reduces the volume of data to a manageable size for annotation and training.

```python
import os

import cv2


def extract_frames(video_path, output_folder, stride=12):
    """Extract frames from a video file and save them as PNG images."""
    # Load the video
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: could not open the video.")
        return

    # Create the output folder
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Extract frames, keeping one frame every `stride` frames
    frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_count += 1
        if frame_count % stride != 0:
            continue
        frame_path = os.path.join(output_folder, f"{frame_count:06d}.png")
        cv2.imwrite(frame_path, frame)

    print(f"Extracted {frame_count} frames to {output_folder}")
```

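To tie the two preprocessing steps together, here is a minimal driver sketch. The folder names (`raw_videos`, `selected_segments`, and the output folders) are illustrative placeholders rather than part of the original pipeline, and the stride of 12 assumes roughly 24 fps footage.

```python
import glob

# Split every downloaded cab ride video into two-minute segments.
for video_path in glob.glob("raw_videos/*.mp4"):
    split_video(video_path, output_folder="segments", segment_length=120)

# After manually keeping only the interesting segments, sample about two
# frames per second from each of them for annotation.
for segment_path in glob.glob("selected_segments/*.mp4"):
    extract_frames(segment_path, output_folder="frames", stride=12)
```
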
#### Annotation Process



From those 15 minutes I selected as few as 170 frames, again creating a small but comprehensive set of situations and conditions to be labelled.
Each of these frames is then manually annotated by a team of trained annotators (Me, Myself and I) using Label Studio. This involves identifying and labeling various objects of interest, such as pedestrians, vehicles, signals, signs, rails, etc. The annotations are exported in the YOLO format, which includes bounding boxes and object class labels.

```html
<View>
  <image name="image" value="$image" />
  <RectangleLabels name="label" toName="image">
    <label value="TrafficLight" background="#FFA39E" />
    <label value="Train" background="#FFC069" />
    <label value="CargoWagon" background="#AD8B00" />
    <label value="Sign" background="#a09eff" />
    <label value="Pedestrian" background="#cf0c02" />
    <label value="Car" background="#0dd311" />
    <label value="Bike" background="#fb8313" />
    <label value="Tram" background="#FFA39E" />
    <label value="Bus" background="#D4380D" />
  </RectangleLabels>
</View>
```

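For reference, the exported YOLO annotations pair each image with a plain-text file containing one line per box: the class index followed by the box centre and size, normalised to the image dimensions. A minimal parsing sketch, using a made-up example line (class 2 corresponds to Car in the dataset configuration below):

```python
# Hypothetical line from an exported label file: "<class> <x_center> <y_center> <width> <height>"
line = "2 0.483 0.512 0.087 0.061"

class_id, x_center, y_center, box_width, box_height = line.split()
print(int(class_id), float(x_center), float(y_center), float(box_width), float(box_height))
```
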
## Model Training

The YOLOv8n (nano) model is selected for this project due to its balance of speed and accuracy, making it suitable for real-time detection tasks.
YOLOv8 is known for its improved performance over previous versions through enhancements in architecture and training techniques.

### Dataset Configuration

```yaml
path: ../object-detection/datasets
train: detection
val: detection
names:
  0: Bike
  1: Bus
  2: Car
  3: CargoWagon
  4: Pedestrian
  5: Sign
  6: TrafficLight
  7: Train
  8: Tram
```

### Training Process

The training process involves feeding the annotated frames into the YOLO model. Data augmentation techniques such as rotation, scaling, and color
adjustment are employed to improve the model's robustness by simulating various operational scenarios. The model undergoes several iterations of
training and validation cycles to minimize overfitting and enhance its generalization capabilities.

```bash
yolo.exe train detect data=trainz.detect.yaml model=yolov8n.pt
```

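The same run can also be launched through the ultralytics Python API. A minimal sketch; the epoch count and image size here are illustrative defaults rather than the settings used for the results below:

```python
from ultralytics import YOLO

# Start from the pretrained nano checkpoint and fine-tune on the custom dataset.
model = YOLO("yolov8n.pt")
model.train(data="trainz.detect.yaml", epochs=100, imgsz=640)
```
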
### Validation and Testing

Post-training, the model is validated using a separate set of video frames that were not included in the training set. This step is crucial to evaluate the model's performance and accuracy in detecting objects under different conditions.

```bash
Validating runs\detection\weights\best.pt...
Ultralytics YOLOv8.1.47 🚀 Python-3.11.9 torch-2.2.2+cpu CPU
Model summary (fused): 168 layers, 3007403 parameters, 0 gradients, 8.1 GFLOPs
       Class  Images  Instances  Box(P      R      mAP50  mAP50-95): 100%|██| 6/6 [2.24s/it]
         all     177       1011  0.955  0.876  0.931  0.755
        Bike     177          4      1  0.644  0.764  0.705
         Bus     177         11      1  0.886  0.995  0.895
         Car     177        389  0.943  0.925  0.975  0.761
  CargoWagon     177         33  0.869  0.848  0.865  0.677
  Pedestrian     177        149  0.901   0.94  0.963  0.734
        Sign     177        212  0.949  0.789  0.898  0.604
TrafficLight     177        157  0.964  0.924  0.969  0.701
       Train     177         34  0.992  0.971  0.975  0.856
        Tram     177         22  0.976  0.955  0.978  0.862
Speed: 0.9ms preprocess, 55.5ms inference, 0.0ms loss, 0.7ms postprocess per image
Results saved to runs\detection\
```

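The same evaluation can be reproduced from Python; a hedged sketch using the ultralytics API (the attribute names follow the current ultralytics metrics object and may differ slightly between versions):

```python
from ultralytics import YOLO

model = YOLO("runs/detection/weights/best.pt")
metrics = model.val(data="trainz.detect.yaml")

# Aggregate detection quality over all classes.
print("mAP50:", metrics.box.map50)
print("mAP50-95:", metrics.box.map)
```
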
### Performance Metrics

The effectiveness of the trained model is measured using standard metrics such as precision, recall, and Intersection over Union (IoU).
In the validation and testing phase of training a YOLO vision model, it is essential to measure its performance to ensure it can reliably identify objects under various conditions. Two critical metrics used for this purpose are precision and recall. These metrics provide insight into the model's accuracy and its ability to detect all relevant objects within the video frames. Also note the performance of the model without any further optimization or attention: 55 ms of inference time, reaching ~20 frames per second without any GPU assistance, is very good; further parameter fine-tuning or resolution reduction could make a significant difference but is beyond the scope of this POC.



**Precision** (or positive predictive value) measures the accuracy of the detections made by the model. In the context of the YOLO vision model for cab ride videos, precision reflects the proportion of correct positive detections out of all positive detections made. For example, if the model identifies 100 objects as vehicles and 90 of these identifications are correct, the precision is 90%. High precision is crucial in transportation settings to minimize false alarms, which can lead to unnecessary disruptions or desensitization to alerts.

**Recall** (or sensitivity) measures the model's ability to find all the relevant cases (or objects) within the dataset. In terms of the project, recall assesses the proportion of actual objects in the video frames that the model successfully detects. For instance, if there are 100 actual vehicles in the video and the model correctly identifies 85 of them, the recall is 85%. High recall is particularly important in safety-critical applications like transportation to ensure that potential hazards are not overlooked.

Both metrics are especially important because they help balance the model's performance. A high precision rate with a low recall rate might indicate that the model is too conservative, missing potential hazards. Conversely, a high recall rate with low precision might mean the model generates many false positives, which could reduce the trust in or efficiency of the system. Therefore, tuning the model to achieve a balanced trade-off between precision and recall is vital for practical deployment in public transportation monitoring systems.

**Intersection over Union** (IoU) is another metric used alongside precision and recall. It measures the overlap between the predicted bounding box and the actual bounding box, providing a direct measurement of localization accuracy, which is essential for precise object detection in dynamic environments like those captured in train and tram cab ride videos.

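To make those definitions concrete, here is a small self-contained sketch of the three quantities; the counts and boxes are made up for illustration, and boxes are given as (x1, y1, x2, y2) pixel coordinates:

```python
def precision(tp, fp):
    # Of everything the model flagged, how much was actually correct?
    return tp / (tp + fp) if tp + fp else 0.0


def recall(tp, fn):
    # Of everything that was really there, how much did the model find?
    return tp / (tp + fn) if tp + fn else 0.0


def iou(box_a, box_b):
    # Intersection over Union of two axis-aligned boxes (x1, y1, x2, y2).
    x1, y1 = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
    x2, y2 = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
    inter = max(0, x2 - x1) * max(0, y2 - y1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - inter
    return inter / union if union else 0.0


# Example: 90 correct detections, 10 false alarms, 15 missed objects.
print(precision(90, 10))                           # 0.9
print(recall(90, 15))                              # ~0.857
print(iou((0, 0, 100, 100), (50, 50, 150, 150)))   # ~0.143
```
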
## From Object Detection to Segmentation

Meta's SAM (Segment-Anything Model) provides a powerful tool for generating segmentation datasets using an initial set of detection data. This is particularly useful for situations where you have a dataset labeled for object detection and you want to extend it to include segmentation labels, which are typically more detailed and involve classifying each pixel of the object.



### Extending Detection Models to Generate a Segmentation Dataset

Building upon the foundation laid by the initial object detection model, this project took a significant step forward by employing Meta's Segment-Anything Model (SAM) to enhance our dataset with segmentation labels. The integration of SAM into our methodology allowed us to transform standard detection outputs (specifically, bounding boxes) into detailed pixel-level segmentation maps. This process bridged the gap between detection and segmentation, providing a comprehensive understanding of each object's precise contours and boundaries within the urban transit environment captured in our cab ride videos.



### Integration of SAM with Detection Outputs

Initially, our project utilized a robust detection model trained to identify various objects, such as vehicles, pedestrians, and other significant elements, within the urban landscape. The detection model efficiently located these objects and outlined them with bounding boxes. The transition from detection to segmentation began by feeding these bounding box coordinates into SAM. SAM's sophisticated algorithms were then applied to precisely delineate the shapes enclosed within these boxes, focusing on the texture, color, and form contrasts between the objects and their backgrounds.



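As an illustration of this hand-off, the ultralytics package exposes SAM with box prompts. A minimal sketch; the frame path is a placeholder and the exact prompt interface may vary between ultralytics versions:

```python
from ultralytics import SAM, YOLO

detector = YOLO("runs/detection/weights/best.pt")
sam = SAM("sam_b.pt")

# Detect first, then prompt SAM with the predicted boxes so it returns
# a pixel-level mask for every detection in the frame.
result = detector("frames/000123.png")[0]   # placeholder frame path
boxes = result.boxes.xyxy.tolist()          # [[x1, y1, x2, y2], ...]
masks = sam(result.orig_img, bboxes=boxes)
```
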
### Creating a Rich Segmentation Dataset

The result of this integration was a series of high-quality segmentation masks that corresponded to each detected object. These masks detailed the objects at a pixel level, thus providing a far more nuanced dataset than was originally available with mere detection labels. To compile this enriched dataset, each original image was paired with its newly generated segmentation mask. This pairing formed a comprehensive set of data that included both the original detection information and the advanced segmentation details.

```python
from ultralytics.data.annotator import auto_annotate

auto_annotate(
    data="datasets\\track-detection",
    det_model="runs\\detection\\weights\\best.pt",
    sam_model='sam_b.pt',
    output_dir="datasets\\autosegment")
```

### Quality Assurance and Dataset Refinement

Critical to this methodology is the quality assurance phase. Each generated segmentation mask should undergo a thorough review to ensure that it meets the project's standards for accuracy and consistency. This step is essential but far less time- and resource-consuming than manual annotation. The precision of the segmentation masks will directly influence the effectiveness of subsequent models trained using this data. Where discrepancies or inaccuracies are noted, adjustments should be made through manual corrections to the masks, ensuring that the dataset upholds the integrity required for advanced computer vision applications.

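For that review step, a quick way to eyeball the generated masks is to draw each polygon from the auto-generated labels back onto its frame. A small sketch, assuming the usual YOLO segmentation label layout of one `class x1 y1 x2 y2 ...` line per object with coordinates normalised to the image size; the file paths are placeholders:

```python
import cv2
import numpy as np

image = cv2.imread("datasets/autosegment/images/000123.png")   # placeholder paths
height, width = image.shape[:2]

with open("datasets/autosegment/labels/000123.txt") as f:
    for line in f:
        values = line.split()
        coords = list(map(float, values[1:]))  # values[0] is the class index
        # De-normalise the polygon and draw its outline for visual inspection.
        points = np.array(coords, dtype=np.float32).reshape(-1, 2) * [width, height]
        cv2.polylines(image, [points.astype(np.int32)], isClosed=True,
                      color=(0, 255, 0), thickness=2)

cv2.imwrite("review_000123.png", image)
```
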
### Utilization for Advanced Model Training

The enriched segmentation dataset prepared in this manner is not merely an exercise but a practical toolkit for further research and development. With these detailed segmentation maps, we could train more sophisticated models capable of performing complex tasks that rely on an intricate understanding of the spatial and textural context of objects within an image. The annotated masks can now be used to help annotate further data, or to crop/hide parts of the frames for different sub-task processing. Such tasks include object tracking, distance estimation, obstacle detection, sign reading, and signal interpretation. All these tasks might require different specialized models with varying performance requirements; therefore, generating the initial segmentation masks from the live images at low cost is essential.

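As one example of that next step, the auto-generated dataset can be used to fine-tune a lightweight segmentation model with the ultralytics API. A hedged sketch; the data YAML name and training settings are placeholders for whatever the segmentation dataset ends up being called:

```python
from ultralytics import YOLO

# Fine-tune the nano segmentation variant on the SAM-generated masks.
seg_model = YOLO("yolov8n-seg.pt")
seg_model.train(data="trainz.segment.yaml", epochs=100, imgsz=640)
```
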
## Conclusion

This exploration into the use of YOLO for object detection on cab ride videos has revealed the significant potential of AI in public transportation. The successful application of YOLOv8n demonstrates not just a technological triumph but also a blueprint for future innovations in autonomous navigation and safety enhancements. By creatively leveraging YouTube videos as a data source and employing Meta's SAM for segmentation, I have shown that even with constrained resources and a very limited amount of annotated data, one can generate a dataset rich enough to train a sophisticated model.

My takeaways from this experience include:

* The feasibility of applying advanced AI models like YOLO to real-world situations with limited data.
* The importance of balancing precision and recall in model performance, particularly in safety-critical applications.
* The versatility of YOLO, which extends beyond detection to enable comprehensive scene understanding through segmentation.
* The power of leveraging large, compute- and resource-intensive models to train small, lightweight, specialized models.

This work paves the way for more intricate applications and sets the stage for further refinement and application of AI in public transportation, promising a future where safety and efficiency are greatly enhanced by intelligent systems.
projects.json
ADDED
@@ -0,0 +1,11 @@
[
  {
    "id": "1",
    "title": "Object Detection with YOLO",
    "description": ["This project is about object detection using YOLO algorithm.",
      "YOLO is a state-of-the-art, real-time object detection system."],
    "skills": ["Python", "OpenCV", "YOLO"],
    "tasks": ["object-detection", "image-segmentation", "real-time-processing", "computer-vision"],
    "link": "object-detection.html"
  }
]
style.css
CHANGED
@@ -1,28 +1,182 @@
(Previous 28-line stylesheet removed: body, text colour/margin, and .card rules; the removed lines are truncated in the diff view. The new stylesheet follows.)

/* HTML5 display-role reset for older browsers */
article, aside, details, figcaption, figure,
footer, header, hgroup, menu, nav, section {
    display: block;
}

body {
    margin: 0; /* Remove default margin */
    padding: 0; /* Remove default padding */
    line-height: 1;
    font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
    background-color: #f4f4f4; /* Light grey background */
    color: #333; /* Main text color */
}

a {
    color: #067df7; /* Huggingface blue for links */
    text-decoration: none; /* No underlines on links */
}

a:hover {
    text-decoration: underline; /* Underline on hover for links */
}

header {
    background-color: #fff; /* White background for the header */
    padding: 2em; /* Padding around header content */
    text-align: center; /* Centered header text */
    border-bottom: 2px solid #eaeaea; /* Light grey border at the bottom */
}

header h1 {
    color: #333; /* Dark grey color for the main title */
    font-size: 2.5rem; /* Larger font size for the main title */
    margin-bottom: 0.5em; /* Spacing below the main title */
}

header p {
    color: #666; /* Medium grey for the subtitle */
    font-size: 1.2rem; /* Subtitle size */
    margin-top: 0; /* Align top of subtitle with title's bottom */
}

footer {
    background-color: #fff; /* White background for the footer */
    padding: 1em; /* Padding around footer content */
    text-align: center; /* Centered footer text */
    border-top: 2px solid #eaeaea; /* Light grey border at the top */
    font-size: 0.9rem; /* Smaller font size for footer */
}

footer img {
    vertical-align: middle; /* Align images with text */
    margin-right: 0.5em; /* Space between icon and text */
}

footer a {
    color: #067df7; /* Consistent link color */
}

.project-card {
    background-color: #fff; /* White background for projects */
    margin: 1em; /* Margin around cards */
    padding: 2em; /* Padding inside cards */
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* Subtle shadow for depth */
    border-radius: 10px; /* Rounded corners for cards */
    transition: transform 0.3s ease, box-shadow 0.3s ease; /* Smooth transitions for hover effects */
}

.project-card:hover {
    transform: translateY(-5px); /* Slight lift effect on hover */
    box-shadow: 0 10px 15px rgba(0, 0, 0, 0.15); /* Enhanced shadow on hover */
}

.project-card h2 {
    color: #333; /* Dark grey for titles */
    margin-bottom: 0.8em; /* Space below title */
}

.project-card p {
    color: #666; /* Medium grey for descriptions */
    margin-bottom: 1em; /* Space below description */
}

.project-card a {
    font-weight: bold; /* Make "View Project" links bold */
    color: #067df7; /* Use blue for call to action */
    padding: 0.5em 1em; /* Padding for clickable area */
    border: 2px solid #067df7; /* Border matching the text color */
    border-radius: 5px; /* Rounded corners for buttons */
    display: inline-block; /* Allow for padding and border */
}

.project-card a:hover {
    background-color: #067df7; /* Background color on hover */
    color: #fff; /* Text color on hover */
}

/* Breadcrumb styling */
#breadcrumb {
    padding: 0.5em 1em;
    background-color: #f9f9f9; /* Light grey background to stand out */
    border-left: 3px solid #fae901; /* Blue accent line */
    color: #333; /* Dark text for readability */
    font-size: 0.9rem; /* Smaller font size for the breadcrumb */
    margin-bottom: 1em; /* Space before the main content */
    position: sticky; /* Stick to the top when scrolling */
    top: 0;
    z-index: 10; /* Ensure it's above other content */
}

/* Markdown container styling for better reading experience */
#markdown-container {
    padding: 1em; /* Padding around the text */
    background-color: #fff; /* White background for reading */
    border-radius: 5px; /* Slightly rounded corners */
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); /* Subtle shadow for depth */
    max-width: 800px; /* Max width to maintain optimal line length for reading */
    margin: 1em auto; /* Center the container and add margin around it */
    word-wrap: break-word; /* Ensure long words don't overflow */
}

/* Style the Table of Contents */
#toc {
    padding: 1em; /* Padding around the content */
    background-color: #fff; /* White background */
    border-radius: 5px; /* Slightly rounded corners */
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); /* Subtle shadow for depth */
    max-width: 800px; /* Max width to maintain optimal line length for reading */
    margin: 1em auto; /* Center the container and add margin around it */
    list-style: none; /* Remove default list styling */
}

#toc li a {
    color: #067df7; /* Blue color for links to match the theme */
    text-decoration: none; /* No underline */
    display: block; /* Block level to add padding */
    padding: 0.2em 0; /* Padding for each link */
}

#toc li a:hover {
    background-color: #f0f0f0; /* Light grey background on hover */
    border-radius: 3px; /* Slight rounding on hover */
}

/* Enhance the appearance of code blocks */
pre code {
    display: block;
    padding: 1em; /* Padding inside code blocks */
    background-color: #f0f0f0; /* Light grey background for code blocks */
    border-radius: 5px; /* Rounded corners for code blocks */
    overflow-x: auto; /* Enable horizontal scrolling if the code is too wide */
}

/* Make images responsive */
img {
    max-width: 100%; /* Make images responsive */
    height: auto; /* Adjust height automatically */
    display: block; /* Images are block level to apply max-width */
    margin: 1em 0; /* Margin above and below images */
}

/* Responsive design for smaller screens */
@media (max-width: 768px) {
    #markdown-container,
    #toc {
        padding: 0.5em; /* Smaller padding on small screens */
        margin: 0.5em; /* Smaller margin on small screens */
    }

    header h1 {
        font-size: 2rem; /* Slightly smaller font for smaller screens */
    }

    header p {
        font-size: 1rem; /* Smaller subtitle on smaller screens */
    }

    .project-card {
        margin: 0.5em;
    }
}