https://github.com/ENCODE-DCC/encoded
Raw File
Tip revision: 19304c5e7c957a149892f0ccb0202aa74d44f49a authored by Keenan Graham on 11 November 2021, 03:38:03 UTC
Rename reference epigenomes, search mouse as well
Tip revision: 19304c5
LogToCsv.py
import re
#regex = '([(\d\.)]+) - - \[(.*?)\] "(.*?)" (\d+) "-" (.*?) "(.*?)" "(.*?)"'
# Primary line layout:
#   <addr> - - [<bracketed>] "<quoted>" <int> <int> "-" "<quoted>" "<quoted>"
# Raw strings are used so that \d and \[ reach the regex engine untouched
# instead of being treated as (invalid) string escapes; the compiled
# expressions are exactly the same as before.
pattern = re.compile(r'([(\d\.)]+) - - \[(.*?)\] "(.*?)" (\d+) (\d+) "-" "(.*?)" "(.*?)"')
# Fallback layout: the second dash may be any text, the "-" placeholder is
# replaced by an arbitrary quoted field, and a space must follow the last
# quoted field (group 9 is lazy and is never used).
pattern_trim = re.compile(r'([(\d\.)]+) - (.*?) \[(.*?)\] "(.*?)" (\d+) (\d+) "(.*?)" "(.*?)" (.*?)')

# Capture group 7 of either pattern is broken apart on any of these characters.
split_chars = re.compile(r'[-&?()]')


def convert_log(lines, out):
    """Convert log lines into one text record per line written to *out*.

    Each line is tried against ``pattern`` first and ``pattern_trim`` second.
    A matching line is written as ``str((fields, parts))`` followed by a
    newline, where *fields* is a 6-tuple of capture groups and *parts* is
    capture group 7 split on ``-``, ``&``, ``?``, ``(`` and ``)``.
    A line that matches neither pattern is copied to *out* unchanged.

    NOTE(review): the record is the repr of a Python tuple, not real CSV;
    the format is kept as-is because downstream consumers are not visible here.

    Args:
        lines: iterable of text lines (for example an open text file).
        out: object with a ``write(str)`` method.

    Returns:
        ``(fallback_count, unmatched_count)``: how many lines needed
        ``pattern_trim`` and how many matched neither pattern.
    """
    fallback_count = 0
    unmatched_count = 0

    for line in lines:
        match = pattern.search(line)
        if match:
            fields = match.group(1, 2, 3, 4, 5, 6)
        else:
            # Only pay for the fallback regex when the primary one failed.
            match = pattern_trim.search(line)
            if match is None:
                # Previously this case crashed with AttributeError because the
                # fallback match object was used without checking it.
                unmatched_count += 1
                out.write(line)
                continue
            fallback_count += 1
            # Group 2 (text between the dashes) is skipped; group 8 takes the
            # place of the primary pattern's group 6.
            fields = match.group(1, 3, 4, 5, 6, 8)

        parts = split_chars.split(match.group(7))
        out.write(str((fields, parts)))
        out.write("\n")

    return fallback_count, unmatched_count


def main():
    """Convert 'encodeproject.org.log' into 'encodeProjectToCSV.csv'."""
    # `with` guarantees both files are closed even if conversion raises.
    with open('encodeproject.org.log', 'r') as log_file, \
            open('encodeProjectToCSV.csv', 'w') as output_file:
        elif_count, none_count = convert_log(log_file, output_file)

    print("None count: %d" % none_count)


if __name__ == "__main__":
    main()
back to top