diff options
Diffstat (limited to 'lib/support/mysql-postgresql-converter/db_converter.py')
-rw-r--r-- | lib/support/mysql-postgresql-converter/db_converter.py | 256 |
1 files changed, 256 insertions, 0 deletions
diff --git a/lib/support/mysql-postgresql-converter/db_converter.py b/lib/support/mysql-postgresql-converter/db_converter.py new file mode 100644 index 0000000..8a7f8a2 --- /dev/null +++ b/lib/support/mysql-postgresql-converter/db_converter.py @@ -0,0 +1,256 @@ +#!/usr/bin/env python + +""" +Fixes a MySQL dump made with the right format so it can be directly +imported to a new PostgreSQL database. + +Dump using: +mysqldump --compatible=postgresql --default-character-set=utf8 -r databasename.mysql -u root databasename +""" + +import re +import sys +import os +import time +import subprocess + + +def parse(input_filename, output_filename): + "Feed it a file, and it'll output a fixed one" + + # State storage + if input_filename == "-": + num_lines = -1 + else: + num_lines = int(subprocess.check_output(["wc", "-l", input_filename]).strip().split()[0]) + tables = {} + current_table = None + creation_lines = [] + enum_types = [] + foreign_key_lines = [] + index_lines = [] + drop_index_lines = [] + sequence_lines = [] + cast_lines = [] + num_inserts = 0 + started = time.time() + + # Open output file and write header. Logging file handle will be stdout + # unless we're writing output to stdout, in which case NO PROGRESS FOR YOU. + if output_filename == "-": + output = sys.stdout + logging = open(os.devnull, "w") + else: + output = open(output_filename, "w") + logging = sys.stdout + + if input_filename == "-": + input_fh = sys.stdin + else: + input_fh = open(input_filename) + + + output.write("-- Converted by db_converter\n") + output.write("START TRANSACTION;\n") + output.write("SET standard_conforming_strings=off;\n") + output.write("SET escape_string_warning=off;\n") + output.write("SET CONSTRAINTS ALL DEFERRED;\n\n") + + for i, line in enumerate(input_fh): + time_taken = time.time() - started + percentage_done = (i+1) / float(num_lines) + secs_left = (time_taken / percentage_done) - time_taken + logging.write("\rLine %i (of %s: %.2f%%) [%s tables] [%s inserts] [ETA: %i min %i sec]" % ( + i + 1, + num_lines, + ((i+1)/float(num_lines))*100, + len(tables), + num_inserts, + secs_left // 60, + secs_left % 60, + )) + logging.flush() + line = line.decode("utf8").strip().replace(r"\\", "WUBWUBREALSLASHWUB").replace(r"\0", "").replace(r"\'", "''").replace("WUBWUBREALSLASHWUB", r"\\") + # Ignore comment lines + if line.startswith("--") or line.startswith("/*") or line.startswith("LOCK TABLES") or line.startswith("DROP TABLE") or line.startswith("UNLOCK TABLES") or not line: + continue + + # Outside of anything handling + if current_table is None: + # Start of a table creation statement? + if line.startswith("CREATE TABLE"): + current_table = line.split('"')[1] + tables[current_table] = {"columns": []} + creation_lines = [] + # Inserting data into a table? + elif line.startswith("INSERT INTO"): + output.write(re.sub(r"([^'])'0000-00-00 00:00:00'", r"\1NULL", line.encode("utf8")) + "\n") + num_inserts += 1 + # ??? + else: + print "\n ! Unknown line in main body: %s" % line + + # Inside-create-statement handling + else: + # Is it a column? + if line.startswith('"'): + useless, name, definition = line.strip(",").split('"',2) + try: + type, extra = definition.strip().split(" ", 1) + + # This must be a tricky enum + if ')' in extra: + type, extra = definition.strip().split(")") + + except ValueError: + type = definition.strip() + extra = "" + extra = re.sub("CHARACTER SET [\w\d]+\s*", "", extra.replace("unsigned", "")) + extra = re.sub("COLLATE [\w\d]+\s*", "", extra.replace("unsigned", "")) + + # See if it needs type conversion + final_type = None + final_default = None + set_sequence = None + if type == "tinyint(1)": + type = "int4" + set_sequence = True + final_type = "boolean" + + if "DEFAULT '0'" in extra: + final_default = "FALSE" + elif "DEFAULT '1'" in extra: + final_default = "TRUE" + + elif type.startswith("int("): + type = "integer" + set_sequence = True + elif type.startswith("bigint("): + type = "bigint" + set_sequence = True + elif type == "longtext": + type = "text" + elif type == "mediumtext": + type = "text" + elif type == "tinytext": + type = "text" + elif type.startswith("varchar("): + size = int(type.split("(")[1].rstrip(")")) + type = "varchar(%s)" % (size * 2) + elif type.startswith("smallint("): + type = "int2" + set_sequence = True + elif type == "datetime": + type = "timestamp with time zone" + extra = extra.replace("NOT NULL", "") + elif type == "double": + type = "double precision" + elif type == "blob": + type = "bytea" + elif type.startswith("enum(") or type.startswith("set("): + + types_str = type.split("(")[1].rstrip(")").rstrip('"') + types_arr = [type_str.strip('\'') for type_str in types_str.split(",")] + + # Considered using values to make a name, but its dodgy + # enum_name = '_'.join(types_arr) + enum_name = "{0}_{1}".format(current_table, name) + + if enum_name not in enum_types: + output.write("CREATE TYPE {0} AS ENUM ({1}); \n".format(enum_name, types_str)); + enum_types.append(enum_name) + + type = enum_name + + if final_type: + cast_lines.append("ALTER TABLE \"%s\" ALTER COLUMN \"%s\" DROP DEFAULT" % (current_table, name)) + cast_lines.append("ALTER TABLE \"%s\" ALTER COLUMN \"%s\" TYPE %s USING CAST(\"%s\" as %s)" % (current_table, name, final_type, name, final_type)) + if final_default: + cast_lines.append("ALTER TABLE \"%s\" ALTER COLUMN \"%s\" SET DEFAULT %s" % (current_table, name, final_default)) + # ID fields need sequences [if they are integers?] + if name == "id" and set_sequence is True: + sequence_lines.append("DROP SEQUENCE IF EXISTS %s_id_seq" % (current_table)) + sequence_lines.append("CREATE SEQUENCE %s_id_seq" % (current_table)) + sequence_lines.append("SELECT setval('%s_id_seq', max(id)) FROM %s" % (current_table, current_table)) + sequence_lines.append("ALTER TABLE \"%s\" ALTER COLUMN \"id\" SET DEFAULT nextval('%s_id_seq')" % (current_table, current_table)) + # Record it + creation_lines.append('"%s" %s %s' % (name, type, extra)) + tables[current_table]['columns'].append((name, type, extra)) + # Is it a constraint or something? + elif line.startswith("PRIMARY KEY"): + creation_lines.append(line.rstrip(",")) + elif line.startswith("CONSTRAINT"): + foreign_key_lines.append("ALTER TABLE \"%s\" ADD CONSTRAINT %s DEFERRABLE INITIALLY DEFERRED" % (current_table, line.split("CONSTRAINT")[1].strip().rstrip(","))) + foreign_key_lines.append("CREATE INDEX ON \"%s\" %s" % (current_table, line.split("FOREIGN KEY")[1].split("REFERENCES")[0].strip().rstrip(","))) + elif line.startswith("UNIQUE KEY \""): + index_name = line.split('"')[1].split('"')[0] + index_columns = line.split("(")[1].split(")")[0] + index_lines.append("CREATE UNIQUE INDEX \"%s\" ON %s (%s)" % (index_name, current_table, index_columns)) + drop_index_lines.append("DROP INDEX IF EXISTS \"%s\"" % index_name) + elif line.startswith("UNIQUE KEY"): + index_columns = line.split("(")[1].split(")")[0] + index_lines.append("CREATE UNIQUE INDEX ON %s (%s)" % (current_table, index_columns)) + elif line.startswith("KEY \""): + index_name = line.split('"')[1].split('"')[0] + index_columns = line.split("(")[1].split(")")[0] + index_lines.append("CREATE INDEX \"%s\" ON %s (%s)" % (index_name, current_table, index_columns)) + drop_index_lines.append("DROP INDEX IF EXISTS \"%s\"" % index_name) + elif line.startswith("KEY"): + index_columns = line.split("(")[1].split(")")[0] + index_lines.append("CREATE INDEX ON %s (%s)" % (current_table, index_columns)) + elif line.startswith("FULLTEXT KEY"): + fulltext_keys = " || ' ' || ".join( line.split('(')[-1].split(')')[0].replace('"', '').split(',') ) + index_lines.append("CREATE INDEX ON %s USING gin(to_tsvector('english', %s))" % (current_table, fulltext_keys)) + # Is it the end of the table? + elif line == ");": + output.write("DROP TABLE IF EXISTS \"%s\";\n" % current_table) + output.write("CREATE TABLE \"%s\" (\n" % current_table) + for i, line in enumerate(creation_lines): + output.write(" %s%s\n" % (line, "," if i != (len(creation_lines) - 1) else "")) + output.write(');\n\n') + current_table = None + # ??? + else: + print "\n ! Unknown line inside table creation: %s" % line + + + # Finish file + output.write("\n-- Post-data save --\n") + output.write("COMMIT;\n") + output.write("START TRANSACTION;\n") + + # Write typecasts out + output.write("\n-- Typecasts --\n") + for line in cast_lines: + output.write("%s;\n" % line) + + # Write FK constraints out + output.write("\n-- Foreign keys --\n") + for line in foreign_key_lines: + output.write("%s;\n" % line) + + # Write sequences out + output.write("\n-- Sequences --\n") + for line in sequence_lines: + output.write("%s;\n" % line) + + # This line is an anchor for move_drop_indexes.ed + output.write("\n-- Drop indexes --\n") + for line in drop_index_lines: + output.write("%s;\n" % line) + # This line is an anchor for move_drop_indexes.ed + output.write("-- END Drop indexes --\n") + + # Write indexes out + output.write("\n-- Indexes --\n") + for line in index_lines: + output.write("%s;\n" % line) + + # Finish file + output.write("\n") + output.write("COMMIT;\n") + print "" + + +if __name__ == "__main__": + parse(sys.argv[1], sys.argv[2]) |