import os
from six.moves import cPickle
import numpy as np
from word2vec_helper import Word2Vec
import math


class DataLoader(object):
    def __init__(self, data_dir, batch_size, seq_max_length, w2v, data_type):
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.seq_max_length = seq_max_length
        self.w2v = w2v
        self.trainingSamples = []
        self.validationSamples = []
        self.testingSamples = []
        self.train_frac = 0.85
        self.valid_frac = 0.05

        self.load_corpus(self.data_dir)

        if data_type == 'train':
            self.create_batches(self.trainingSamples)
        elif data_type == 'test':
            self.create_batches(self.testingSamples)
        elif data_type == 'valid':
            self.create_batches(self.validationSamples)

        self.reset_batch_pointer()

    def _print_stats(self):
        print('Loaded {}: training samples: {}, validation samples: {}, testing samples: {}'.format(
            self.data_dir, len(self.trainingSamples), len(self.validationSamples), len(self.testingSamples)))

    def load_corpus(self, base_path):
        """Read/create the poem data.

        Two files are involved while the training file is created:
        1. self.fullSamplePath
        2. self.filteredSamplesPath
        """
        tensor_file = os.path.join(base_path, 'poem_ids.txt')
        print('tensor_file: %s' % tensor_file)

        datasetExist = os.path.isfile(tensor_file)
        # If the preprocessed data file does not exist, create it.
        if not datasetExist:
            print('Training samples not found. Creating them from the raw sample dataset...')

            fullSamplesPath = os.path.join(self.data_dir, 'poems_edge_split.txt')
            # Create/read the raw sample dataset: self.trainingSamples
            print('fullSamplesPath: %s' % fullSamplesPath)
            self.load_from_text_file(fullSamplesPath)
        else:
            self.load_dataset(tensor_file)

        # Special-token ids; '[' and ']' mark the start and end of a poem.
        self.padToken = self.w2v.ix('<pad>')
        self.goToken = self.w2v.ix('[')
        self.eosToken = self.w2v.ix(']')
        self.unknownToken = self.w2v.ix('<unknown>')

        self._print_stats()
        # assert self.padToken == 0

    def load_from_text_file(self, in_file):
        with open(in_file, 'r', encoding='utf-8') as fr:
            poems = fr.readlines()

        print('Total number of Tang poems: %d' % len(poems))
        # self.seq_max_length = max([len(poem) for poem in poems])
        # print("seq_max_length: %d" % (self.seq_max_length))

        poem_ids = DataLoader.get_text_idx(poems, self.w2v.vocab_hash, self.seq_max_length)

        # Follow-up processing:
        # 1. Word filtering: drop rare words (wordCount <= filterVocab) and
        #    keep the vocabSize most frequent ones (disabled here).
        # print('Filtering words (vocabSize = {} and wordCount > {})...'.format(
        #     self.args.vocabularySize,
        #     self.args.filterVocab
        # ))
        # self.filterFromFull()

        # 2. Split the data: e.g. 40,000 poems yield 34,000 train / 2,000
        #    valid / 4,000 test (0.85 / 0.05 / remainder).
        print('Splitting data into train, valid, test sets...')
        n_samples = len(poem_ids)
        train_size = int(self.train_frac * n_samples)
        valid_size = int(self.valid_frac * n_samples)
        test_size = n_samples - train_size - valid_size

        print('n_samples=%d, train_size=%d, valid_size=%d, test_size=%d' % (
            n_samples, train_size, valid_size, test_size))
        self.testingSamples = poem_ids[-test_size:]
        self.validationSamples = poem_ids[-valid_size - test_size:-test_size]
        self.trainingSamples = poem_ids[:train_size]

        # Save the preprocessed training dataset.
        print('Saving dataset...')
        poem_ids_file = os.path.join(self.data_dir, 'poem_ids.txt')
        self.save_dataset(poem_ids_file)

    # 2. Utility: write the data file with pickle.
    def save_dataset(self, filename):
        """Save the dataset with pickle.

        The data file contains the preprocessed samples and the max
        sequence length.

        Args:
            filename (str): pickle filename
        """
        with open(filename, 'wb') as handle:
            data = {
                'trainingSamples': self.trainingSamples
            }

            if len(self.validationSamples) > 0:
                data['validationSamples'] = self.validationSamples
                data['testingSamples'] = self.testingSamples
            data['maxSeqLen'] = self.seq_max_length

            cPickle.dump(data, handle, -1)  # Use the highest protocol available.

    # 3. Utility: read the data file with pickle.
    def load_dataset(self, filename):
        """Load the dataset with pickle.

        Args:
            filename (str): pickle filename
        """
        print('Loading dataset from {}'.format(filename))
        with open(filename, 'rb') as handle:
            data = cPickle.load(handle)
            self.trainingSamples = data['trainingSamples']

            if 'validationSamples' in data:
                self.validationSamples = data['validationSamples']
                self.testingSamples = data['testingSamples']

            print('file maxSeqLen = {}'.format(data['maxSeqLen']))

    def create_batches(self, samples):
        sample_size = len(samples)
        self.num_batches = int(math.ceil(sample_size / float(self.batch_size)))
        new_sample_size = self.num_batches * self.batch_size

        # Create the batch tensors.
        x_lengths = []
        x_seqs = np.ndarray((new_sample_size, self.seq_max_length), dtype=np.int32)
        y_seqs = np.ndarray((new_sample_size, self.seq_max_length), dtype=np.int32)
        for i, sample in enumerate(samples):
            # Fill with padding so batch_size samples align into one 2D array.
            x_lengths.append(len(sample))
            x_seqs[i] = sample + [self.padToken] * (self.seq_max_length - len(sample))

        # Repeat samples from the front so the last batch is full.
        for i in range(sample_size, new_sample_size):
            copyi = i - sample_size
            x_seqs[i] = x_seqs[copyi]
            x_lengths.append(x_lengths[copyi])

        # Targets are the inputs shifted left by one step; the last target
        # wraps around to the first input token.
        y_seqs[:, :-1] = x_seqs[:, 1:]
        y_seqs[:, -1] = x_seqs[:, 0]
        x_len_array = np.array(x_lengths)

        self.x_batches = np.split(x_seqs.reshape(self.batch_size, -1), self.num_batches, 1)
        self.x_len_batches = np.split(x_len_array.reshape(self.batch_size, -1), self.num_batches, 1)
        self.y_batches = np.split(y_seqs.reshape(self.batch_size, -1), self.num_batches, 1)
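
    # Batch layout note: x_seqs.reshape(batch_size, -1) puts samples
    # 0..num_batches-1 on row 0, the next num_batches samples on row 1, and
    # so on; np.split(..., num_batches, 1) then makes batch b hold sample
    # j * num_batches + b in row j. The same reshape is applied to the
    # lengths and targets, so x, x_len and y stay aligned row by row.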

    def next_batch_dynamic(self):
        x, x_len, y = self.x_batches[self.pointer], self.x_len_batches[self.pointer], self.y_batches[self.pointer]
        self.pointer += 1
        return x, x_len, y

    def next_batch(self):
        x, y = self.x_batches[self.pointer], self.y_batches[self.pointer]
        self.pointer += 1
        return x, y

    def reset_batch_pointer(self):
        self.pointer = 0

    @staticmethod
    def get_text_idx(text, vocab, max_document_length):
        # Reserve one position for the ']' end-of-poem marker.
        max_document_length_without_end = max_document_length - 1
        text_array = []
        for i, x in enumerate(text):
            line = []
            if len(x) > max_document_length:
                # Truncate at the last full sentence (Chinese full stop '。')
                # that fits, then re-append the ']' end marker.
                x_parts = x[:max_document_length_without_end]
                idx = x_parts.rfind('。')
                if idx > -1:
                    x_parts = x_parts[0:idx + 1] + ']'
                x = x_parts

            for j, w in enumerate(x):
                if w not in vocab:
                    w = '<unknown>'
                line.append(vocab[w])
            text_array.append(line)

        return text_array
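
    # Example with hypothetical ids: given vocab = {'[': 0, ']': 1,
    # '<unknown>': 2, '春': 5}, get_text_idx(['[春x]'], vocab, 10)
    # returns [[0, 5, 2, 1]] ('x' is not in the vocab, so it maps to
    # '<unknown>').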


if __name__ == '__main__':
    base_path = './data/poem'
    w2v_file = os.path.join(base_path, "vectors_poem.bin")
    w2v = Word2Vec(w2v_file)

    seq_max_length = 100  # maximum poem length used when poem_ids.txt was built
    dataloader = DataLoader(base_path, 20, seq_max_length, w2v.model, 'train')
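
    # Minimal usage sketch: iterate once over every batch, then rewind.
    # Shapes follow from create_batches above.
    for _ in range(dataloader.num_batches):
        x, x_len, y = dataloader.next_batch_dynamic()
        # x, y: (batch_size, seq_max_length) int32 token ids;
        # x_len: (batch_size, 1) unpadded lengths.
        print(x.shape, x_len.shape, y.shape)
    dataloader.reset_batch_pointer()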