https://github.com/ENCODE-DCC/encoded
Raw File
Tip revision: 8ce8e2830f38c39fd9ed6f93c9f4a2e6418468fe authored by Philip Adenekan on 07 April 2020, 02:42:05 UTC
test
Tip revision: 8ce8e28
LogToCsv.py
import re
#regex = '([(\d\.)]+) - - \[(.*?)\] "(.*?)" (\d+) "-" (.*?) "(.*?)" "(.*?)"'
# Primary line layout: address, the literal " - - ", a bracketed field, a
# quoted field, two integers, a literal "-" in quotes and two more quoted
# fields (7 capture groups).
# Raw strings keep the regex escapes (\d, \., \[) from being parsed as
# (invalid) string escapes; the compiled expressions are unchanged.
pattern = re.compile(r'([(\d\.)]+) - - \[(.*?)\] "(.*?)" (\d+) (\d+) "-" "(.*?)" "(.*?)"')
# Fallback layout: the second "-" may be arbitrary text, the first trailing
# quoted field is captured instead of being a literal "-", and the line must
# continue with a space after the last quoted field (9 capture groups, the
# last one lazy and therefore always empty under search()).
pattern_trim = re.compile(r'([(\d\.)]+) - (.*?) \[(.*?)\] "(.*?)" (\d+) (\d+) "(.*?)" "(.*?)" (.*?)')


def _format_match(match, field_groups, split_group):
    """Render one matched log line as the text stored in the output file.

    The text is ``str()`` of a 2-tuple: the captured groups listed in
    *field_groups*, followed by the list obtained by splitting group
    *split_group* on any of the characters ``- & ? ( )``.
    """
    pieces = re.split(r'[-&?()]', match.group(split_group))
    return str((match.group(*field_groups), pieces))


def convert_log(log_path='encodeproject.org.log', csv_path='encodeProjectToCSV.csv'):
    """Convert the log at *log_path* into one output row per input line.

    Each line is tried against ``pattern`` first and ``pattern_trim`` second.
    Matched lines are written as the tuple text built by ``_format_match``;
    lines matching neither pattern are copied to the output unchanged.

    NOTE(review): rows are Python tuple reprs rather than comma-separated
    values; the format is kept as-is because consumers of the output file are
    not visible from here.

    Returns:
        ``(elif_count, none_count)``: the number of lines that only matched
        ``pattern_trim`` and the number of lines that matched neither pattern.
    """
    elif_count = 0
    none_count = 0

    # Context managers close both files even if a line raises midway.
    with open(log_path, 'r') as log_file, open(csv_path, 'w') as output_file:
        for line in log_file:
            result = pattern.search(line)
            if result:
                output_file.write(_format_match(result, (1, 2, 3, 4, 5, 6), 7))
                output_file.write("\n")
                continue

            # Only try the fallback when the primary pattern failed, and check
            # that it matched before touching its groups (a None match used to
            # raise AttributeError here).
            result_trim = pattern_trim.search(line)
            if result_trim:
                elif_count += 1
                output_file.write(_format_match(result_trim, (1, 3, 4, 5, 6, 8), 7))
                output_file.write("\n")
                continue

            # Unrecognised line: count it and pass it through verbatim (it
            # still carries its own line terminator).
            none_count += 1
            output_file.write(line)

    return elif_count, none_count


if __name__ == "__main__":
    elif_count, none_count = convert_log()
    print("None count: %d" % none_count)
back to top