# Author: Rubinigg Michael
# Contributor: Rubinigg Michael
# Abstract: Assesses all column names that are being used in a list of selected CSV files and merges the content of all files in the correct column of a single output file
# Version 1.0
# Release date: 2024-10-16
# Place: Graz, Austria, EU
# Company: dr Mag. rer. nat. Michael Rubinigg
# Programming Language: Python 3.11
# Rights:  Creative Commons Attribution 4.0 (CC-BY)

import os
import csv
import tkinter as tk
import pandas as pd
from tkinter import filedialog, messagebox, simpledialog, ttk
from tqdm import tqdm

def _remove_file_if_present(path):
    """Delete *path* if it exists; report (but do not raise) any OS error."""
    try:
        os.remove(path)
    except FileNotFoundError:
        # Nothing to clean up
        pass
    except OSError as e:
        print(f"An error occurred: {e}")


def merge_files(selected_files, selected_headers, output_directory, output_filename,
                source_information, separator_input, decimal_input):
    """Merge several CSV files into one semicolon-separated output file.

    The column layout of the output is the ordered union of the column names
    found in ``selected_headers`` (first occurrence wins). Every file in
    ``selected_files`` is then read, its columns are aligned to that layout
    and its rows are appended to ``<output_filename>.csv``. Columns that a
    data file does not provide, and missing values in general, are written as
    the text ``NULL``. Columns of a data file that are not part of the layout
    are dropped.

    A second file, ``<output_filename>_protocol.csv``, receives one
    ``path;row_count`` line per merged data file.

    Args:
        selected_files: Paths of the CSV files whose rows are merged.
        selected_headers: Paths of the CSV files whose header rows define the
            output columns.
        output_directory: Directory for the output files; created if missing.
            An empty string means the current working directory.
        output_filename: Output file name without extension.
        source_information: If true, a leading ``raw_data_file`` column holding
            the base name of the originating file is added to every row.
        separator_input: Column delimiter of the input files.
        decimal_input: Decimal character of the input files. The output always
            uses ``.`` as decimal character.
    """
    # Optional leading column that records which input file a row came from
    master_headerlist = ['raw_data_file'] if source_information else []

    output_path = os.path.join(output_directory, output_filename + '.csv')
    header_path = os.path.join(output_directory, output_filename + '_header.csv')
    protocol_output_path = os.path.join(output_directory, output_filename + '_protocol.csv')

    # os.makedirs('') raises, so only create a directory when one was given
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    # Start from a clean slate; the protocol is opened in append mode below,
    # so a stale protocol file would otherwise keep its old lines
    for stale_path in (output_path, protocol_output_path):
        _remove_file_if_present(stale_path)

    # Collect the ordered union of all column names used in the header files
    known_columns = set(master_headerlist)
    for file_path in tqdm(selected_headers, desc='Getting headers', unit='file'):
        # nrows=0 parses the header row only
        header_frame = pd.read_csv(file_path, nrows=0, delimiter=separator_input, decimal=decimal_input)
        for column in header_frame.columns:
            if column not in known_columns:
                known_columns.add(column)
                master_headerlist.append(column)

    # An empty frame with the master columns writes just the header line
    pd.DataFrame(columns=master_headerlist).to_csv(output_path, sep=';', index=False)

    # newline='' plus an explicit line terminator keeps the protocol line
    # endings consistent with the files pandas writes
    with open(protocol_output_path, 'a', newline='', encoding='utf-8') as protocol_file:
        protocol_writer = csv.writer(protocol_file, delimiter=';', lineterminator=os.linesep)

        for file_path in tqdm(selected_files, desc='Read content from input files', unit='file'):
            df_raw = pd.read_csv(file_path, delimiter=separator_input, decimal=decimal_input)

            # Only tag rows when tracking was requested; otherwise an input
            # column that happens to be named 'raw_data_file' keeps its data
            if source_information:
                df_raw['raw_data_file'] = os.path.basename(file_path)

            # Align to the master layout; na_rep writes missing cells as NULL
            # without changing the dtypes of the frame
            df_raw.reindex(columns=master_headerlist).to_csv(
                output_path, sep=';', decimal='.', mode='a',
                index=False, header=False, na_rep='NULL',
            )

            # One protocol line per merged file: source path and row count
            protocol_writer.writerow([file_path, len(df_raw)])

    # NOTE(review): nothing in this function creates '<name>_header.csv';
    # the cleanup is kept for compatibility - confirm whether it is still needed
    _remove_file_if_present(header_path)

def select_files(title_text):
    """Open a multi-file chooser limited to CSV files.

    Args:
        title_text: Title shown on the dialog window.

    Returns:
        The value returned by ``filedialog.askopenfilenames``.
    """
    csv_only = (("CSV files", "*.csv"),)
    return filedialog.askopenfilenames(title=title_text, filetypes=csv_only)

def select_output_directory():
    """Ask the user for the output directory and return the dialog's result."""
    return filedialog.askdirectory(title="Select Output Directory")

def get_output_filename():
    """Prompt for the output file name (without extension) and return the dialog's result."""
    dialog_title = "Name of the output file"
    dialog_prompt = "Enter the output filename (excluding extension):"
    return simpledialog.askstring(dialog_title, dialog_prompt)

def get_source_information():
    """Ask a yes/no question about tracking the raw data file and return the answer."""
    question = "Do you want to track the raw data file in the dataset? "
    return messagebox.askyesno("Question", question)

def get_listvalue(options, title_text):
    """Show a small always-on-top window with a combobox and return its value.

    The window blocks until the user presses "Submit" or closes it; in both
    cases the text currently shown in the combobox is returned.

    Args:
        options: Values offered in the combobox; the first one is preselected.
        title_text: Caption displayed above the combobox.

    Returns:
        The combobox text at the moment the window was closed.
    """
    # Create a main window
    root = tk.Tk()
    root.attributes('-topmost', True)
    root.eval('tk::PlaceWindow . center')

    # Attach label and variable to this window explicitly instead of
    # relying on tkinter's implicit default root
    label = ttk.Label(root, text=title_text)
    label.pack(padx=5, pady=5)

    selected_value = tk.StringVar(master=root)

    combobox = ttk.Combobox(root, textvariable=selected_value, values=options)
    if options:
        combobox.current(0)  # Preselect the first item
    combobox.pack(padx=10, pady=10)

    chosen = ''

    def on_submit():
        # Read the value while the window still exists, then close it
        nonlocal chosen
        chosen = selected_value.get()
        root.destroy()

    submit_button = tk.Button(root, text="Submit", command=on_submit)
    submit_button.pack(padx=10, pady=10)

    # Closing the window via the title bar behaves like pressing Submit
    root.protocol('WM_DELETE_WINDOW', on_submit)

    # Run the application
    root.mainloop()

    return chosen


def main():
    """Collect all settings through dialogs and run the merge.

    The run is aborted with a console message when the user selects no header
    files, cancels the directory dialog, or provides no output file name,
    because the merge cannot produce a usable result without them.
    """
    # Select the files whose header rows define the output columns
    selected_headers = select_files("Select source files for the header")
    if not selected_headers:
        print("No header files selected - aborting.")
        return

    # Select the files whose rows are merged; an empty selection still
    # yields an output file that contains only the header line
    selected_files = select_files("Select source files for the data")

    # Select output directory
    output_directory = select_output_directory()
    if not output_directory:
        print("No output directory selected - aborting.")
        return

    # Get user-defined output filename (None when the dialog is cancelled)
    output_filename = get_output_filename()
    if not output_filename:
        print("No output filename entered - aborting.")
        return

    # Ask whether a 'raw_data_file' column should be added
    source_information = get_source_information()

    # Get the decimal character used in the input files
    decimal_input = get_listvalue([',', '.'], 'Select the decimal in the input file')

    # Get the column delimiter used in the input files
    separator_input = get_listvalue([',', ';'], 'Select the column separator in the input file')

    # Merge the selected files and save the result to the output file
    merge_files(selected_files, selected_headers, output_directory, output_filename, source_information, separator_input, decimal_input)

# Run the dialog-driven merge only when the file is executed as a script,
# not when it is imported as a module
if __name__ == "__main__":
    main()