https://github.com/ENCODE-DCC/encoded
Raw File
Tip revision: 508d7dfecfa86c1d95aee85ecebacd4ceb651625 authored by Emma O'Neill on 06 August 2019, 17:13:29 UTC
resolve QA bug
Tip revision: 508d7df
LogToCsv.py
import re
#regex = '([(\d\.)]+) - - \[(.*?)\] "(.*?)" (\d+) "-" (.*?) "(.*?)" "(.*?)"'
# Primary layout: `IP - - [time] "request" status bytes "-" "field" "field"`.
pattern = re.compile(r'([(\d\.)]+) - - \[(.*?)\] "(.*?)" (\d+) (\d+) "-" "(.*?)" "(.*?)"')
# Fallback layout: arbitrary second field, arbitrary first quoted field after
# the byte count, and something following the last quoted field.
pattern_trim = re.compile(r'([(\d\.)]+) - (.*?) \[(.*?)\] "(.*?)" (\d+) (\d+) "(.*?)" "(.*?)" (.*?)')

# Characters on which the seventh captured group is broken into parts.
_FIELD_DELIMITERS = re.compile(r'[-&?()]')

# Kept at module level for backward compatibility; updated when run as a script.
none_count = 0
elif_count = 0


def parse_line(line):
    """Parse one log line into an output record.

    Args:
        line: A single line of the access log.

    Returns:
        A ``(record, used_fallback)`` pair.  ``record`` is a
        ``(fields, parts)`` tuple, where ``fields`` holds six captured groups
        and ``parts`` is captured group 7 split on ``-&?()``.  ``record`` is
        ``None`` when the line matches neither ``pattern`` nor
        ``pattern_trim``.  ``used_fallback`` is True only when
        ``pattern_trim`` produced the record.
    """
    match = pattern.search(line)
    if match:
        fields = match.group(1, 2, 3, 4, 5, 6)
        return (fields, _FIELD_DELIMITERS.split(match.group(7))), False

    # Only try the fallback when the primary layout did not match.
    match = pattern_trim.search(line)
    if match:
        # Group 2 is skipped and group 8 is emitted in its place as the
        # sixth field; group 7 is the one that gets split.
        fields = match.group(1, 3, 4, 5, 6, 8)
        return (fields, _FIELD_DELIMITERS.split(match.group(7))), True

    return None, False


def convert_lines(lines, output):
    """Write one output row per log line and count the non-primary cases.

    Each parsed line is written as ``str(record)`` followed by a newline
    (the Python repr of the tuple, not comma-separated values; the format is
    preserved so existing consumers of the output keep working).  Lines that
    match neither pattern are copied through unchanged.

    Args:
        lines: Iterable of log lines (for example an open text file).
        output: Object with a ``write(str)`` method.

    Returns:
        ``(fallback_count, unmatched_count)``: how many lines needed
        ``pattern_trim`` and how many matched nothing.
    """
    fallback_count = 0
    unmatched_count = 0
    for line in lines:
        record, used_fallback = parse_line(line)
        if record is None:
            unmatched_count += 1
            # The raw line still carries its own line ending.
            output.write(line)
            continue
        if used_fallback:
            fallback_count += 1
        output.write(str(record))
        output.write("\n")
    return fallback_count, unmatched_count


def main(log_path='encodeproject.org.log', csv_path='encodeProjectToCSV.csv'):
    """Convert the log at ``log_path`` into ``csv_path`` and report misses.

    Both files are closed even if conversion raises.  Prints the number of
    lines that matched neither pattern.

    Returns:
        The ``(fallback_count, unmatched_count)`` pair from ``convert_lines``.
    """
    with open(log_path, 'r') as log_file, open(csv_path, 'w') as output_file:
        counts = convert_lines(log_file, output_file)
    print("None count: %d" % counts[1])
    return counts


if __name__ == "__main__":
    elif_count, none_count = main()
back to top