From 8fef56fa766a0a70f715494a5ec499db60b27542 Mon Sep 17 00:00:00 2001 From: Cyberes Date: Thu, 2 Nov 2023 23:35:43 -0600 Subject: [PATCH] add code --- .gitignore | 4 +- exfiltrate.py | 127 +++++++++++++++++++++++++++++++++++++++++++++++ pkg/__init__.py | 0 pkg/image.py | 21 ++++++++ pkg/proxies.py | 4 ++ pkg/spatial.py | 35 +++++++++++++ pkg/thread.py | 23 +++++++++ requirements.txt | 5 ++ 8 files changed, 218 insertions(+), 1 deletion(-) create mode 100644 exfiltrate.py create mode 100644 pkg/__init__.py create mode 100644 pkg/image.py create mode 100644 pkg/proxies.py create mode 100644 pkg/spatial.py create mode 100644 pkg/thread.py create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index 5d381cc..6ad7ca9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +.idea/ +wmts-output/ + # ---> Python # Byte-compiled / optimized / DLL files __pycache__/ @@ -159,4 +162,3 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ - diff --git a/exfiltrate.py b/exfiltrate.py new file mode 100644 index 0000000..83688f5 --- /dev/null +++ b/exfiltrate.py @@ -0,0 +1,127 @@ +import argparse +import base64 +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path + +import numpy as np +import rasterio +from PIL import Image +from rasterio import Affine +from tqdm import tqdm + +from pkg.image import random_file_width +from pkg.spatial import deg2num +from pkg.thread import download_tile + +# TODO: support reusing TIFFs + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Exfiltrate data from WMS servers.') + parser.add_argument('base_url', help='The base URL for the WMS server. Example: https://wmts.nlsc.gov.tw/wmts/nURBAN/default/EPSG:3857/') + parser.add_argument('--zoom', type=int, required=True, help='The zoom level to use.') + parser.add_argument('--threads', type=int, default=10, help='Number of download threads to use.') + parser.add_argument('--referer', help='The content of the Referer header to send.') + parser.add_argument('--output', default='wmts-output', help='Output directory path.') + parser.add_argument('--output-tiff', help='Path for output GeoTIFF. Default: wmts-output/output.tiff') + parser.add_argument('--bbox', required=True, type=str, metavar='Bounding Box', nargs='+', default=(None, None, None, None), help='Bounding Box of the area to download. Separate each value with a space. (top left lat, top left lon, bottom right lat, bottom right lon)') + args = parser.parse_args() + + args.base_url = args.base_url.strip('/') + f'/{args.zoom}/' + base_output = Path(args.output).resolve().absolute().expanduser() + url_hash = base64.b64encode(args.base_url.encode()).decode('utf-8').strip('==') + tiles_output = base_output / url_hash / str(args.zoom) + tiles_output.mkdir(parents=True, exist_ok=True) + + top_left_lat, top_left_lon, bottom_right_lat, bottom_right_lon = map(float, args.bbox) + min_col, min_row = deg2num(top_left_lat, top_left_lon, args.zoom) + max_col, max_row = deg2num(bottom_right_lat, bottom_right_lon, args.zoom) + + if args.output_tiff: + output_tiff = Path(args.output_tiff) + else: + output_tiff = base_output / f'output-z{args.zoom}-{top_left_lat}x{top_left_lon}-{bottom_right_lat}x{bottom_right_lon}.tiff' + + r_headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36', + 'Accept': 'image/avif,image/webp,*/*', + 'Accept-Language': 'en-US,en;q=0.5' + } + if args.referer: + r_headers['Referer'] = args.referer + + tiles = [] + total_downloaded = 0 + row_i = min_row + for row in tqdm(range(min_row, max_row + 1), desc=f'Row {row_i}'): + row_i = row + bar = tqdm(total=len(range(min_col, max_col + 1)), leave=False) + with ThreadPoolExecutor(args.threads) as executor: + futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output)) for col in range(min_col, max_col + 1)] + for future in as_completed(futures): + result = future.result() + if result: + result_row, result_col, new_image = result + tiles.append((result_row, result_col)) + if new_image: + total_downloaded += 1 + bar.update() + bar.close() + + print(f'Downloaded {total_downloaded} images.') + + tile_size = random_file_width(tiles_output) + + # Define the number of rows and columns based on the bounding box + num_rows = max_row - min_row + 1 + num_cols = max_col - min_col + 1 + + # Create an empty array to store the image data + image_data = np.empty((num_rows * tile_size, num_cols * tile_size, 3), dtype=np.uint8) + + + def build_tiff_data(task): + row, col = task + # try: + # row, col = map(int, file.name[:-4].split("_")) + # except: + # print('Bad file:', file) + # return + tile_file = tiles_output / f"{row}_{col}.png" + if not tile_file.is_file(): + raise Exception(f'Tile does not exist: {tile_file}') + + with Image.open(tile_file) as img: + tile_data = np.array(img) + + # Remove the alpha channel + tile_data = tile_data[:, :, :3] + + # Calculate the position of the tile in the image data array + row_pos = (row - min_row) * tile_size + col_pos = (col - min_col) * tile_size + + # Insert the tile data into the image data array + image_data[row_pos:row_pos + tile_size, col_pos:col_pos + tile_size] = tile_data + + + with ThreadPoolExecutor() as executor: + futures = {executor.submit(build_tiff_data, task) for task in tiles} + for future in tqdm(as_completed(futures), total=len(futures), desc='Building TIFF'): + pass + + # Transpose the image data array to the format (bands, rows, cols) + image_data = np.transpose(image_data, (2, 0, 1)) + + # Define the transformation from pixel coordinates to CRS coordinates + transform = Affine.translation(top_left_lon, top_left_lat) * Affine.scale((bottom_right_lon - top_left_lon) / (num_cols * tile_size), (bottom_right_lat - top_left_lat) / (num_rows * tile_size)) + + # Define the CRS + crs = rasterio.crs.CRS.from_string("EPSG:3857") + + # Write the image data to a GeoTIFF file + print('Saving to:', output_tiff) + start = time.time() + with rasterio.open(output_tiff, "w", driver="GTiff", height=num_rows * tile_size, width=num_cols * tile_size, count=3, dtype=str(image_data.dtype), crs=crs, transform=transform, compress="DEFLATE") as dst: + dst.write(image_data, indexes=[1, 2, 3]) + print(f'Saved in {int(time.time() - start)} seconds.') diff --git a/pkg/__init__.py b/pkg/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pkg/image.py b/pkg/image.py new file mode 100644 index 0000000..2a9b9d6 --- /dev/null +++ b/pkg/image.py @@ -0,0 +1,21 @@ +import random +from pathlib import Path + +from PIL import Image + + +def random_file_width(base_path: Path): + random_tile_file = random.choice(list(base_path.glob('*'))) + with Image.open(random_tile_file) as img: + return img.size[0] + + +def is_png(file_path): + """ + Make sure an image is a valid PNG. + Raise exeption if could not load image. + :param file_path: + :return: + """ + img = Image.open(file_path) + return img.format == 'PNG' diff --git a/pkg/proxies.py b/pkg/proxies.py new file mode 100644 index 0000000..46fb266 --- /dev/null +++ b/pkg/proxies.py @@ -0,0 +1,4 @@ +PROXIES = { + "http": 'http://172.0.4.7:9000', + "https": 'http://172.0.4.7:9000', +} diff --git a/pkg/spatial.py b/pkg/spatial.py new file mode 100644 index 0000000..fe6c7b5 --- /dev/null +++ b/pkg/spatial.py @@ -0,0 +1,35 @@ +from rasterio import Affine + +import math + + +def deg2num(lat_deg, lon_deg, zoom): + lat_rad = math.radians(lat_deg) + n = 2.0 ** zoom + xtile = int((lon_deg + 180.0) / 360.0 * n) + ytile = int((1.0 - math.asinh(math.tan(lat_rad)) / math.pi) / 2.0 * n) + return xtile, ytile + + +def calculate_transform(args, min_col, max_col, min_row, max_row): + # Constants for Web Mercator (EPSG:3857) + tile_size_in_pixels = 256 + earth_circumference = 40075016.686 # in meters + + # Calculate the size of a tile in meters at the given zoom level + tile_size_in_meters = earth_circumference / (2 ** args.zoom) + + # Calculate the geographic coordinates of the bounding box + min_x = min_col * tile_size_in_meters + max_x = (max_col + 1) * tile_size_in_meters + min_y = (2 ** args.zoom - max_row - 1) * tile_size_in_meters + max_y = (2 ** args.zoom - min_row) * tile_size_in_meters + + # Calculate the pixel size in x and y directions + pixel_size_x = tile_size_in_meters / tile_size_in_pixels + pixel_size_y = tile_size_in_meters / tile_size_in_pixels + + # Define the transformation + transform = Affine.translation(min_x, min_y) * Affine.scale(pixel_size_x, -pixel_size_y) + + return transform diff --git a/pkg/thread.py b/pkg/thread.py new file mode 100644 index 0000000..31e9010 --- /dev/null +++ b/pkg/thread.py @@ -0,0 +1,23 @@ +import requests + +from pkg.proxies import PROXIES +from .image import is_png + + +def download_tile(task): + row, col, base_url, r_headers, output = task + output_path = output / f"{row}_{col}.png" + if output_path.exists(): + assert is_png(output_path) + return row, col, False + tile_url = f"{base_url}/{row}/{col}" + response = requests.get(tile_url, headers=r_headers, proxies=PROXIES) + if response.status_code == 200: + if not response.headers.get('Content-Type') == 'image/png': + raise Exception(f'Response gave Content-Type: {response.headers.get("Content-Type")}') + with open(output_path, "wb") as f: + f.write(response.content) + assert is_png(output_path) + return row, col, True + else: + print(f"Failed to download tile {row}_{col}") diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..cf8cc63 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +requests==2.31.0 +tqdm==4.66.1 +numpy==1.26.1 +pillow==10.1.0 +rasterio==1.3.9 \ No newline at end of file