Compare commits

...

4 Commits

Author SHA1 Message Date
Cyberes 557985774d f 2023-11-06 19:10:44 -07:00
Cyberes 1d8b45fae4 test 2023-11-06 19:09:17 -07:00
Cyberes 65953c9bde add tiff threading 2023-11-06 18:28:44 -07:00
Cyberes 664eb1a52f optimize building geotiff 2023-11-06 17:48:09 -07:00
5 changed files with 109 additions and 70 deletions

README.md

@@ -2,13 +2,19 @@
_Scrape tiles from WMTS servers._

You know what I hate? Those godforsaken WMTS servers, perched on their digital thrones, acting like they're the TILE TYRANTS of the universe. They think they can just LOCK UP their precious little tiles and keep me from doing my THING? HA!

No more will these WMTS servers shroud their CRAPPY-ASS tiles in mystery. I'm coming for your DATA, you binary BASTARDS, and there's not a SINGLE 1 or 0 you can throw at me that will stop my CHARGE.

You think your firewalls and security mumbo-jumbo can keep me at bay? THINK AGAIN. I'll slice through your defenses like a HOT PIZZA through COLD BUTTER. I'll have your DATA, and there's absolutely NOTHING, I repeat, NOTHING you can do to STOP ME.

So, buckle up, WMTS servers. Your reign of TILE TERROR is about to CRASH AND BURN. I'm coming for your DATA, and I'm bringing a whole lot of CHAOS with me.

### Install

@@ -29,9 +35,12 @@ python3 exfiltrate.py \
--threads 30
```

+Building the GeoTIFF will take dozens of gigs of memory for any significant extent! For example, a 21 mile extent required about 400GB of memory. You can use swap for this, but don't expect it to be very quick if you go this route.

### ArcGIS

-???
+TODO

### Credits
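For a sense of where the 400GB figure in the note above comes from: the in-memory build path that this change set reworks stitches the whole mosaic into a single `numpy` array of shape `(num_rows * tile_size, num_cols * tile_size, 3)`, so memory grows with the square of the extent. A rough, illustrative calculation is below; the 256 px tile edge is an assumed example value, since the script measures the real size with `random_file_width()`.

```python
# Back-of-the-envelope sizing only, not part of the repository. It mirrors the
# np.empty((num_rows * tile_size, num_cols * tile_size, 3), dtype=np.uint8) allocation
# used by the in-memory build path; the 256 px tile edge is an assumed example value.
def mosaic_bytes(num_rows: int, num_cols: int, tile_size: int = 256, bands: int = 3) -> int:
    """Uncompressed size of the stitched mosaic, one byte per uint8 sample."""
    return num_rows * tile_size * num_cols * tile_size * bands

# A square area roughly 1,000 tiles on a side already needs ~196 GB before any temporary copies:
print(f'{mosaic_bytes(1000, 1000) / 1e9:.1f} GB')
```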

exfiltrate.py

@@ -1,8 +1,9 @@
import argparse
import base64
-import time
+import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
+from queue import Queue

import numpy as np
import rasterio
@@ -11,7 +12,7 @@ from rasterio import Affine
from tqdm import tqdm

from pkg.image import random_file_width
-from pkg.spatial import deg2num, lonlat_to_meters
+from pkg.spatial import deg2num
from pkg.thread import download_tile

if __name__ == '__main__':
@@ -22,9 +23,11 @@ if __name__ == '__main__':
    parser.add_argument('--referer', help='The content of the Referer header to send.')
    parser.add_argument('--output', default='wmts-output', help='Output directory path.')
    parser.add_argument('--proxy', action='store_true', help='Enable using a proxy.')
-    parser.add_argument('--tiff-threads', default=None, help='Number of threads to use when building TIFF. Default: auto')
+    parser.add_argument('--tiff-threads', default=10, type=int, help='Number of threads to use when building TIFF. Default: auto')
    parser.add_argument('--output-tiff', help='Path for output GeoTIFF. Default: wmts-output/output.tiff')
    parser.add_argument('--bbox', required=True, type=str, metavar='Bounding Box', nargs='+', default=(None, None, None, None), help='Bounding Box of the area to download. Separate each value with a space. (top left lat, top left lon, bottom right lat, bottom right lon)')
+    # parser.add_argument('--extent', default=None, help='Specify an extent to break the output image to. This is the diagonal.')
+    parser.add_argument('--no-download', action='store_true', help="Don't do any downloading or image checking.")
    args = parser.parse_args()

    args.base_url = args.base_url.strip('/') + f'/{args.zoom}/'
@@ -59,9 +62,14 @@ if __name__ == '__main__':
    for row in row_iter:
        row_i = row
        col_iter = range(min_col, max_col + 1)
-        col_bar = tqdm(total=len(col_iter), leave=False)
+        # if args.no_download:
+        #     for col in col_iter:
+        #         tiles.append((row, col))
+        # else:
        with (ThreadPoolExecutor(args.threads) as executor):
-            futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy)) for col in col_iter]
+            col_bar = tqdm(total=len(col_iter), leave=False)
+            futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy, args.no_download)) for col in col_iter]
            for future in as_completed(futures):
                result = future.result()
                if result:
@@ -76,8 +84,8 @@ if __name__ == '__main__':
                    row_bar.set_postfix({'new_files': total_downloaded, 'failures': len(retries)})
                col_bar.update()
-                row_bar.refresh()
            col_bar.close()
        row_bar.set_postfix({'new_files': total_downloaded, 'failures': len(retries)})
        row_bar.update()
    row_bar.close()
@@ -98,66 +106,79 @@ if __name__ == '__main__':
    print(f'Downloaded {total_downloaded} images.')
+    print('Preparing data...')
    tile_size = random_file_width(tiles_output)

    # Define the number of rows and columns based on the bounding box
    num_rows = max_row - min_row + 1
    num_cols = max_col - min_col + 1

-    # Create an empty array to store the image data
-    image_data = np.empty((num_rows * tile_size, num_cols * tile_size, 3), dtype=np.uint8)
-
-    def build_tiff_data(task):
-        row, col = task
-        tile_file = tiles_output / f"{row}_{col}.png"
-        if not tile_file.is_file():
-            raise Exception(f'Tile does not exist: {tile_file}')
-        with Image.open(tile_file) as img:
-            tile_data = np.array(img)
-        # Remove the alpha channel
-        tile_data = tile_data[:, :, :3]
-        # Replace white pixels with NODATA
-        tile_data[np.all(tile_data == [255, 255, 255], axis=-1)] = [0, 0, 0]
-        # ArcGIS does not like pixels that have zeros in them, eg. (255, 0, 0). We need to convert the zeros to ones, eg. (255, 1, 1).
-        mask = np.any(tile_data == 0, axis=-1) & np.any(tile_data != 0, axis=-1)  # Identify pixels where not all bands are zero and at least one band is zero.
-        for i in range(3):  # Iterate over each band.
-            # For these pixels, set zero bands to one.
-            tile_data[mask & (tile_data[:, :, i] == 0), i] = 0.1
-        # Calculate the position of the tile in the image data array.
-        row_pos = (row - min_row) * tile_size
-        col_pos = (col - min_col) * tile_size
-        # Insert the tile data into the image data array at the correct spot.
-        image_data[row_pos:row_pos + tile_size, col_pos:col_pos + tile_size] = tile_data
-
-    with ThreadPoolExecutor(max_workers=args.tiff_threads) as executor:
-        futures = {executor.submit(build_tiff_data, task) for task in tiles}
-        for future in tqdm(as_completed(futures), total=len(futures), desc='Building TIFF'):
-            pass
-
-    # Transpose the image data array to the format (bands, rows, cols).
-    image_data = np.transpose(image_data, (2, 0, 1))
-
-    # Convert geographic coordinates to Web Mercator coordinates. Not 100% sure this is nessesary.
-    top_left_mx, top_left_my = lonlat_to_meters(top_left_lon, top_left_lat)
-    bottom_right_mx, bottom_right_my = lonlat_to_meters(bottom_right_lon, bottom_right_lat)
-
    # Define the transformation from pixel coordinates to geographic coordinates, which is an Affine transformation that
    # maps pixel coordinates in the image to geographic coordinates on the Earth's surface.
-    transform = (Affine.translation(top_left_lon, top_left_lat)  # Create a translation transformation that shifts the image and set the origin of the image to the top-left corner of the bounding box.
-                 # Create a scaling transformation that scales the image in the x and y directions to convert the pixel coordinates of the image to the geographic coordinates of the bounding box.
-                 * Affine.scale((bottom_right_lon - top_left_lon) / image_data.shape[2], (bottom_right_lat - top_left_lat) / image_data.shape[1]))
+    transform = (Affine.translation(top_left_lon, top_left_lat)
+                 * Affine.scale((bottom_right_lon - top_left_lon) / (num_cols * tile_size),
+                                (bottom_right_lat - top_left_lat) / (num_rows * tile_size)))

-    # Write the image data to a GeoTIFF file
-    print('Saving to:', output_tiff)
-    start = time.time()
-    with rasterio.open(output_tiff, "w", driver="GTiff", height=num_rows * tile_size, width=num_cols * tile_size, count=3, dtype=str(image_data.dtype), crs='EPSG:4326', transform=transform, compress="DEFLATE", nodata=0) as dst:
-        dst.write(image_data, indexes=[1, 2, 3])
-    print(f'Saved in {int(time.time() - start)} seconds.')
+    def worker(pbar):
+        while True:
+            row, col = q.get()
+            if row is None:
+                break
+            tile_file = tiles_output / f"{row}_{col}.png"
+            if not tile_file.is_file():
+                raise Exception(f'Tile does not exist: {tile_file}')
+            with Image.open(tile_file) as img:
+                tile_data = np.array(img, dtype=np.uint8)
+            # Remove the alpha channel
+            tile_data = tile_data[:, :, :3]
+            # Replace white pixels with NODATA
+            tile_data[np.all(tile_data == [255, 255, 255], axis=-1)] = [0, 0, 0]
+            # ArcGIS does not like pixels that have zeros in them, eg. (255, 0, 0). We need to convert the zeros to ones, eg. (255, 1, 1).
+            mask = np.any(tile_data == 0, axis=-1) & np.any(tile_data != 0, axis=-1)  # Identify pixels where not all bands are zero and at least one band is zero.
+            for i in range(3):  # Iterate over each band.
+                # For these pixels, set zero bands to one.
+                tile_data[mask & (tile_data[:, :, i] == 0), i] = 1
+            # Calculate the position of the tile in the image data array.
+            row_pos = (row - min_row) * tile_size
+            col_pos = (col - min_col) * tile_size
+            tile_data = np.transpose(tile_data, (2, 0, 1))
+            # Write the tile data to the GeoTIFF file
+            with lock:
+                dst.write(tile_data, window=rasterio.windows.Window(col_pos, row_pos, tile_size, tile_size), indexes=[1, 2, 3])
+            q.task_done()
+            pbar.update()
+
+    q = Queue()
+    lock = threading.Lock()
+    with rasterio.open(output_tiff, "w", driver="GTiff", height=num_rows * tile_size, width=num_cols * tile_size, count=3, dtype='uint8', crs='EPSG:4326', transform=transform, compress="DEFLATE", nodata=0) as dst:
+        with tqdm(total=len(tiles), desc='Building GeoTIFF') as pbar:
+            threads = []
+            for i in range(args.tiff_threads):
+                t = threading.Thread(target=worker, args=(pbar,))
+                t.start()
+                threads.append(t)
+            for row, col in tiles:
+                q.put((row, col))
+            # block until all tasks are done
+            q.join()
+            # stop workers
+            for i in range(args.tiff_threads):
+                q.put((None, None))
+            for t in threads:
+                t.join()
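The substance of this change: instead of assembling the whole mosaic in RAM and writing it once, worker threads now pull `(row, col)` pairs off a `Queue`, decode one tile at a time, and write it into the matching window of an already-open `rasterio` dataset, with a `threading.Lock` serializing the writes (a single dataset handle should not be written from several threads at once). Below is a minimal, self-contained sketch of that windowed-write pattern; the tile size, mosaic dimensions, bounds, output name, and the synthetic tile contents are placeholder values, not anything from this repository.

```python
import threading
from queue import Queue

import numpy as np
import rasterio
from rasterio.transform import from_bounds
from rasterio.windows import Window

TILE = 256          # placeholder tile edge (pixels)
ROWS, COLS = 4, 4   # placeholder mosaic size (tiles)

# Pixel -> geographic mapping for the whole mosaic (placeholder bounds: west, south, east, north).
transform = from_bounds(121.68, 25.06, 121.70, 25.08, COLS * TILE, ROWS * TILE)

q = Queue()
lock = threading.Lock()

def worker(dst):
    while True:
        item = q.get()
        if item is None:          # sentinel: no more tiles for this worker
            break
        r, c = item
        # Stand-in for decoding a downloaded PNG tile into a (bands, rows, cols) array.
        tile = np.full((3, TILE, TILE), 128, dtype=np.uint8)
        with lock:                # one writer at a time; each worker only ever holds one tile
            dst.write(tile, window=Window(c * TILE, r * TILE, TILE, TILE), indexes=[1, 2, 3])
        q.task_done()

with rasterio.open('mosaic.tiff', 'w', driver='GTiff',
                   width=COLS * TILE, height=ROWS * TILE, count=3, dtype='uint8',
                   crs='EPSG:4326', transform=transform) as dst:
    threads = [threading.Thread(target=worker, args=(dst,)) for _ in range(4)]
    for t in threads:
        t.start()
    for r in range(ROWS):
        for c in range(COLS):
            q.put((r, c))
    q.join()                      # wait until every queued tile has been written
    for _ in threads:
        q.put(None)               # wake each worker so it can exit
    for t in threads:
        t.join()
```

Compared with the previous `ThreadPoolExecutor` plus `np.empty` approach, peak Python-side memory for this pattern is on the order of one tile per worker rather than the full mosaic.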

pkg/image.py

@@ -1,6 +1,7 @@
import random
from pathlib import Path

+import PIL
from PIL import Image

@@ -20,5 +21,6 @@ def is_png(file_path):
    try:
        img = Image.open(file_path)
        return img.format == 'PNG'
-    except:
+    except PIL.UnidentifiedImageError as e:
+        # tqdm.write(str(e))
        return False

pkg/thread.py

@@ -16,14 +16,14 @@ def del_path(p: Path):
def download_tile(task):
-    row, col, base_url, r_headers, output, use_proxy = task
+    row, col, base_url, r_headers, output, use_proxy, no_download = task
    try:
        output_path: Path = output / f"{row}_{col}.png"
        if output_path.exists():
-            if not is_png(output_path):
+            if not no_download and not is_png(output_path):
                # Delete the file and try again.
                del_path(output_path)
-                tqdm.write(f'{output_path} is not a PNG, deleting and retrying...')
+                tqdm.write(f'cannot identify image file: "{output_path}", deleting and retrying...')
            else:
                return row, col, 'exist'
        tile_url = f"{base_url}/{row}/{col}".replace('//', '/').replace(':/', '://')
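Because `download_tile()` receives everything through a single task tuple, every caller has to grow the tuple in step with this signature change (the call site in `exfiltrate.py` above now appends `args.no_download`). A small, hypothetical usage sketch follows; the URL and tile indices are made-up example values, and the Referer simply mirrors the one used in `test.sh`:

```python
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

from pkg.thread import download_tile

r_headers = {'Referer': 'https://maps.nlsc.gov.tw/'}   # mirrors the Referer used in test.sh
task = (
    1234,                                  # row (example value)
    5678,                                  # col (example value)
    'https://example.com/wmts/layer/20',   # base_url (hypothetical)
    r_headers,
    Path('wmts-output'),                   # output directory
    False,                                 # use_proxy
    True,                                  # no_download: skip the PNG re-check for existing files
)

with ThreadPoolExecutor(1) as pool:
    future = pool.submit(download_tile, task)
    print(future.result())                 # e.g. (1234, 5678, 'exist') if the tile is already on disk
```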

test.sh (new file)

@@ -0,0 +1,7 @@
venv/bin/python3 exfiltrate.py \
https://wmts.nlsc.gov.tw/wmts/nURBAN/default/EPSG:3857/ \
--zoom 20 \
--referer https://maps.nlsc.gov.tw/ \
--bbox 25.076387 121.68951 25.068282 121.700175 \
--threads 30 --tiff-threads 100 \
--output ~/Downloads/wmts-output/ --no-download