Compare commits
4 Commits
master
...
tiff-threa
Author | SHA1 | Date |
---|---|---|
Cyberes | 557985774d | |
Cyberes | 1d8b45fae4 | |
Cyberes | 65953c9bde | |
Cyberes | 664eb1a52f |
19
README.md
19
README.md
|
@ -2,13 +2,19 @@
|
|||
|
||||
_Scrape tiles from WMTS servers._
|
||||
|
||||
You know what I hate? Those godforsaken WMTS servers, perched on their digital thrones, acting like they're the TILE TYRANTS of the universe. They think they can just LOCK UP their precious little tiles and keep me from doing my THING? HA!
|
||||
You know what I hate? Those godforsaken WMTS servers, perched on their digital thrones, acting like they're the TILE
|
||||
TYRANTS of the universe. They think they can just LOCK UP their precious little tiles and keep me from doing my THING?
|
||||
HA!
|
||||
|
||||
No more will these WMTS servers shroud their CRAPPY-ASS tiles in mystery. I'm coming for your DATA, you binary BASTARDS, and there's not a SINGLE 1 or 0 you can throw at me that will stop my CHARGE.
|
||||
No more will these WMTS servers shroud their CRAPPY-ASS tiles in mystery. I'm coming for your DATA, you binary BASTARDS,
|
||||
and there's not a SINGLE 1 or 0 you can throw at me that will stop my CHARGE.
|
||||
|
||||
You think your firewalls and security mumbo-jumbo can keep me at bay? THINK AGAIN. I'll slice through your defenses like a HOT PIZZA through COLD BUTTER. I'll have your DATA, and there's absolutely NOTHING, I repeat, NOTHING you can do to STOP ME.
|
||||
You think your firewalls and security mumbo-jumbo can keep me at bay? THINK AGAIN. I'll slice through your defenses like
|
||||
a HOT PIZZA through COLD BUTTER. I'll have your DATA, and there's absolutely NOTHING, I repeat, NOTHING you can do to
|
||||
STOP ME.
|
||||
|
||||
So, buckle up, WMTS servers. Your reign of TILE TERROR is about to CRASH AND BURN. I'm coming for your DATA, and I'm bringing a whole lot of CHAOS with me.
|
||||
So, buckle up, WMTS servers. Your reign of TILE TERROR is about to CRASH AND BURN. I'm coming for your DATA, and I'm
|
||||
bringing a whole lot of CHAOS with me.
|
||||
|
||||
### Install
|
||||
|
||||
|
@ -29,9 +35,12 @@ python3 exfiltrate.py \
|
|||
--threads 30
|
||||
```
|
||||
|
||||
Building the GeoTIFF will take dozens of gigabytes of memory for any significant extent! For example, a 21-mile extent
|
||||
required about 400 GB of memory. You can use swap for this, but don't expect it to be quick if you go this route.
|
||||
|
||||
### ArcGIS
|
||||
|
||||
???
|
||||
TODO
|
||||
|
||||
### Credits
|
||||
|
||||
|
|
143
exfiltrate.py
143
exfiltrate.py
|
@ -1,8 +1,9 @@
|
|||
import argparse
|
||||
import base64
|
||||
import time
|
||||
import threading
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
from queue import Queue
|
||||
|
||||
import numpy as np
|
||||
import rasterio
|
||||
|
@ -11,7 +12,7 @@ from rasterio import Affine
|
|||
from tqdm import tqdm
|
||||
|
||||
from pkg.image import random_file_width
|
||||
from pkg.spatial import deg2num, lonlat_to_meters
|
||||
from pkg.spatial import deg2num
|
||||
from pkg.thread import download_tile
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@ -22,9 +23,11 @@ if __name__ == '__main__':
|
|||
parser.add_argument('--referer', help='The content of the Referer header to send.')
|
||||
parser.add_argument('--output', default='wmts-output', help='Output directory path.')
|
||||
parser.add_argument('--proxy', action='store_true', help='Enable using a proxy.')
|
||||
parser.add_argument('--tiff-threads', default=None, help='Number of threads to use when building TIFF. Default: auto')
|
||||
parser.add_argument('--tiff-threads', default=10, type=int, help='Number of threads to use when building TIFF. Default: auto')
|
||||
parser.add_argument('--output-tiff', help='Path for output GeoTIFF. Default: wmts-output/output.tiff')
|
||||
parser.add_argument('--bbox', required=True, type=str, metavar='Bounding Box', nargs='+', default=(None, None, None, None), help='Bounding Box of the area to download. Separate each value with a space. (top left lat, top left lon, bottom right lat, bottom right lon)')
|
||||
# parser.add_argument('--extent', default=None, help='Specify an extent to break the output image to. This is the diagonal.')
|
||||
parser.add_argument('--no-download', action='store_true', help="Don't do any downloading or image checking.")
|
||||
args = parser.parse_args()
|
||||
|
||||
args.base_url = args.base_url.strip('/') + f'/{args.zoom}/'
|
||||
|
@ -59,9 +62,14 @@ if __name__ == '__main__':
|
|||
for row in row_iter:
|
||||
row_i = row
|
||||
col_iter = range(min_col, max_col + 1)
|
||||
col_bar = tqdm(total=len(col_iter), leave=False)
|
||||
|
||||
# if args.no_download:
|
||||
# for col in col_iter:
|
||||
# tiles.append((row, col))
|
||||
# else:
|
||||
with (ThreadPoolExecutor(args.threads) as executor):
|
||||
futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy)) for col in col_iter]
|
||||
col_bar = tqdm(total=len(col_iter), leave=False)
|
||||
futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy, args.no_download)) for col in col_iter]
|
||||
for future in as_completed(futures):
|
||||
result = future.result()
|
||||
if result:
|
||||
|
@ -76,8 +84,8 @@ if __name__ == '__main__':
|
|||
row_bar.set_postfix({'new_files': total_downloaded, 'failures': len(retries)})
|
||||
col_bar.update()
|
||||
row_bar.refresh()
|
||||
col_bar.close()
|
||||
row_bar.set_postfix({'new_files': total_downloaded, 'failures': len(retries)})
|
||||
col_bar.close()
|
||||
row_bar.set_postfix({'new_files': total_downloaded, 'failures': len(retries)})
|
||||
row_bar.update()
|
||||
row_bar.close()
|
||||
|
||||
|
@ -98,66 +106,79 @@ if __name__ == '__main__':
|
|||
|
||||
print(f'Downloaded {total_downloaded} images.')
|
||||
|
||||
print('Preparing data...')
|
||||
|
||||
tile_size = random_file_width(tiles_output)
|
||||
|
||||
# Define the number of rows and columns based on the bounding box
|
||||
num_rows = max_row - min_row + 1
|
||||
num_cols = max_col - min_col + 1
|
||||
|
||||
# Create an empty array to store the image data
|
||||
image_data = np.empty((num_rows * tile_size, num_cols * tile_size, 3), dtype=np.uint8)
|
||||
|
||||
|
||||
def build_tiff_data(task):
|
||||
row, col = task
|
||||
tile_file = tiles_output / f"{row}_{col}.png"
|
||||
if not tile_file.is_file():
|
||||
raise Exception(f'Tile does not exist: {tile_file}')
|
||||
|
||||
with Image.open(tile_file) as img:
|
||||
tile_data = np.array(img)
|
||||
|
||||
# Remove the alpha channel
|
||||
tile_data = tile_data[:, :, :3]
|
||||
|
||||
# Replace white pixels with NODATA
|
||||
tile_data[np.all(tile_data == [255, 255, 255], axis=-1)] = [0, 0, 0]
|
||||
|
||||
# ArcGIS does not like pixels that have zeros in them, e.g. (255, 0, 0). We need to convert the zeros to ones, e.g. (255, 1, 1).
|
||||
mask = np.any(tile_data == 0, axis=-1) & np.any(tile_data != 0, axis=-1) # Identify pixels where not all bands are zero and at least one band is zero.
|
||||
for i in range(3): # Iterate over each band.
|
||||
# For these pixels, set zero bands to one.
|
||||
tile_data[mask & (tile_data[:, :, i] == 0), i] = 0.1
|
||||
|
||||
# Calculate the position of the tile in the image data array.
|
||||
row_pos = (row - min_row) * tile_size
|
||||
col_pos = (col - min_col) * tile_size
|
||||
|
||||
# Insert the tile data into the image data array at the correct spot.
|
||||
image_data[row_pos:row_pos + tile_size, col_pos:col_pos + tile_size] = tile_data
|
||||
|
||||
|
||||
with ThreadPoolExecutor(max_workers=args.tiff_threads) as executor:
|
||||
futures = {executor.submit(build_tiff_data, task) for task in tiles}
|
||||
for future in tqdm(as_completed(futures), total=len(futures), desc='Building TIFF'):
|
||||
pass
|
||||
|
||||
# Transpose the image data array to the format (bands, rows, cols).
|
||||
image_data = np.transpose(image_data, (2, 0, 1))
|
||||
|
||||
# Convert geographic coordinates to Web Mercator coordinates. Not 100% sure this is necessary.
|
||||
top_left_mx, top_left_my = lonlat_to_meters(top_left_lon, top_left_lat)
|
||||
bottom_right_mx, bottom_right_my = lonlat_to_meters(bottom_right_lon, bottom_right_lat)
|
||||
|
||||
# Define the transformation from pixel coordinates to geographic coordinates, which is an Affine transformation that
|
||||
# maps pixel coordinates in the image to geographic coordinates on the Earth's surface.
|
||||
transform = (Affine.translation(top_left_lon, top_left_lat) # Create a translation transformation that shifts the image and set the origin of the image to the top-left corner of the bounding box.
|
||||
# Create a scaling transformation that scales the image in the x and y directions to convert the pixel coordinates of the image to the geographic coordinates of the bounding box.
|
||||
* Affine.scale((bottom_right_lon - top_left_lon) / image_data.shape[2], (bottom_right_lat - top_left_lat) / image_data.shape[1]))
|
||||
transform = (Affine.translation(top_left_lon, top_left_lat)
|
||||
* Affine.scale((bottom_right_lon - top_left_lon) / (num_cols * tile_size),
|
||||
(bottom_right_lat - top_left_lat) / (num_rows * tile_size)))
|
||||
|
||||
# Write the image data to a GeoTIFF file
|
||||
print('Saving to:', output_tiff)
|
||||
start = time.time()
|
||||
with rasterio.open(output_tiff, "w", driver="GTiff", height=num_rows * tile_size, width=num_cols * tile_size, count=3, dtype=str(image_data.dtype), crs='EPSG:4326', transform=transform, compress="DEFLATE", nodata=0) as dst:
|
||||
dst.write(image_data, indexes=[1, 2, 3])
|
||||
print(f'Saved in {int(time.time() - start)} seconds.')
|
||||
|
||||
def worker(pbar):
|
||||
while True:
|
||||
row, col = q.get()
|
||||
if row is None:
|
||||
break
|
||||
|
||||
tile_file = tiles_output / f"{row}_{col}.png"
|
||||
if not tile_file.is_file():
|
||||
raise Exception(f'Tile does not exist: {tile_file}')
|
||||
|
||||
with Image.open(tile_file) as img:
|
||||
tile_data = np.array(img, dtype=np.uint8)
|
||||
|
||||
# Remove the alpha channel
|
||||
tile_data = tile_data[:, :, :3]
|
||||
|
||||
# Replace white pixels with NODATA
|
||||
tile_data[np.all(tile_data == [255, 255, 255], axis=-1)] = [0, 0, 0]
|
||||
|
||||
# ArcGIS does not like pixels that have zeros in them, e.g. (255, 0, 0). We need to convert the zeros to ones, e.g. (255, 1, 1).
|
||||
mask = np.any(tile_data == 0, axis=-1) & np.any(tile_data != 0, axis=-1) # Identify pixels where not all bands are zero and at least one band is zero.
|
||||
for i in range(3): # Iterate over each band.
|
||||
# For these pixels, set zero bands to one.
|
||||
tile_data[mask & (tile_data[:, :, i] == 0), i] = 1
|
||||
|
||||
# Calculate the position of the tile in the image data array.
|
||||
row_pos = (row - min_row) * tile_size
|
||||
col_pos = (col - min_col) * tile_size
|
||||
|
||||
tile_data = np.transpose(tile_data, (2, 0, 1))
|
||||
|
||||
# Write the tile data to the GeoTIFF file
|
||||
with lock:
|
||||
dst.write(tile_data, window=rasterio.windows.Window(col_pos, row_pos, tile_size, tile_size), indexes=[1, 2, 3])
|
||||
|
||||
q.task_done()
|
||||
pbar.update()
|
||||
|
||||
|
||||
q = Queue()
|
||||
lock = threading.Lock()
|
||||
|
||||
with rasterio.open(output_tiff, "w", driver="GTiff", height=num_rows * tile_size, width=num_cols * tile_size, count=3, dtype='uint8', crs='EPSG:4326', transform=transform, compress="DEFLATE", nodata=0) as dst:
|
||||
with tqdm(total=len(tiles), desc='Building GeoTIFF') as pbar:
|
||||
threads = []
|
||||
for i in range(args.tiff_threads):
|
||||
t = threading.Thread(target=worker, args=(pbar,))
|
||||
t.start()
|
||||
threads.append(t)
|
||||
|
||||
for row, col in tiles:
|
||||
q.put((row, col))
|
||||
|
||||
# block until all tasks are done
|
||||
q.join()
|
||||
|
||||
# stop workers
|
||||
for i in range(args.tiff_threads):
|
||||
q.put((None, None))
|
||||
for t in threads:
|
||||
t.join()
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import random
|
||||
from pathlib import Path
|
||||
|
||||
import PIL
|
||||
from PIL import Image
|
||||
|
||||
|
||||
|
@ -20,5 +21,6 @@ def is_png(file_path):
|
|||
try:
|
||||
img = Image.open(file_path)
|
||||
return img.format == 'PNG'
|
||||
except:
|
||||
except PIL.UnidentifiedImageError as e:
|
||||
# tqdm.write(str(e))
|
||||
return False
|
||||
|
|
|
@ -16,14 +16,14 @@ def del_path(p: Path):
|
|||
|
||||
|
||||
def download_tile(task):
|
||||
row, col, base_url, r_headers, output, use_proxy = task
|
||||
row, col, base_url, r_headers, output, use_proxy, no_download = task
|
||||
try:
|
||||
output_path: Path = output / f"{row}_{col}.png"
|
||||
if output_path.exists():
|
||||
if not is_png(output_path):
|
||||
if not no_download and not is_png(output_path):
|
||||
# Delete the file and try again.
|
||||
del_path(output_path)
|
||||
tqdm.write(f'{output_path} is not a PNG, deleting and retrying...')
|
||||
tqdm.write(f'cannot identify image file: "{output_path}", deleting and retrying...')
|
||||
else:
|
||||
return row, col, 'exist'
|
||||
tile_url = f"{base_url}/{row}/{col}".replace('//', '/').replace(':/', '://')
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
venv/bin/python3 exfiltrate.py \
|
||||
https://wmts.nlsc.gov.tw/wmts/nURBAN/default/EPSG:3857/ \
|
||||
--zoom 20 \
|
||||
--referer https://maps.nlsc.gov.tw/ \
|
||||
--bbox 25.076387 121.68951 25.068282 121.700175 \
|
||||
--threads 30 --tiff-threads 100 \
|
||||
--output ~/Downloads/wmts-output/ --no-download
|
Loading…
Reference in New Issue