-
Notifications
You must be signed in to change notification settings - Fork 444
/
Copy pathBigFile.cpp
2024 lines (1878 loc) · 67.4 KB
/
BigFile.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// JAB: this is required for pwrite() in this module
#undef _XOPEN_SOURCE
#define _XOPEN_SOURCE 500
#include "gb-include.h"
#include "BigFile.h"
#include "Dir.h"
#include "Threads.h"
#include "Stats.h"
#include "Statsdb.h"
//#include "DiskPageCache.h"
#ifdef ASYNCIO
#include <aio.h>
#endif
// main.cpp will wait for this to be zero before exiting so all unlink/renames
// can complete
int32_t g_unlinkRenameThreads = 0;
// NOTE(review): presumably the times the most recent disk read was started
// and completed; in the code visible here they are only ever reset to 0
// (by the BigFile constructor) -- confirm where they get advanced
int64_t g_lastDiskReadStarted = 0LL;
int64_t g_lastDiskReadCompleted = 0LL;
// while true, BigFile::readwrite() fails niceness 0 reads with EDISKSTUCK
// instead of going to disk
bool g_diskIsStuck = false;
// completion callback that BigFile::readwrite() hands to g_threads.call()
static void doneWrapper ( void *state , ThreadEntry *t ) ;
// does the disk i/o described by "fstate". per its call site in
// BigFile::readwrite() it returns false and sets errno on error.
static bool readwrite_r ( FileState *fstate , ThreadEntry *t ) ;
// destructor: just closes the file via close()
BigFile::~BigFile () {
	this->close();
}
//#define O_DIRECT 040000
// . default constructor
// . leaves the BigFile empty and unopened: no parts registered, no virtual
//   fd assigned, file size and last-modified time not yet computed
BigFile::BigFile () {
	// no part files registered yet
	m_numParts     = 0;
	m_maxParts     = 0;
	m_usePartFiles = true;
	// default open flags. O_DIRECT is not requested.
	m_flags = O_RDWR;
	// -1 means "not assigned yet" (m_vfd, see open()) or "not computed
	// yet" (m_fileSize/m_lastModified, see getFileSize() and
	// getLastModifiedTime())
	m_vfd          = -1;
	m_fileSize     = -1;
	m_lastModified = -1;
	// start with cleared counters/flags
	m_numThreads = 0;
	m_isClosing  = false;
	// NOTE(review): these are file-scope globals, not members, so
	// constructing ANY BigFile clears the stuck-disk state for the whole
	// process -- confirm this is intended
	g_diskIsStuck           = false;
	g_lastDiskReadCompleted = 0;
	g_lastDiskReadStarted   = 0;
}
// . point this BigFile at "dir"/"baseFilename" and register whatever part
//   files already exist in "dir"
// . "stripeDir" is accepted but not used by this implementation
// . returns false if copying either name fails or if addParts() fails,
//   true otherwise
bool BigFile::set ( char *dir , char *baseFilename , char *stripeDir ) {
	// whatever size and mod time we had cached no longer apply
	m_lastModified = -1;
	m_fileSize     = -1;
	m_usePartFiles = true;
	// the directory name buffer
	m_dir.reset();
	m_dir.setLabel("bfd");
	// the base filename buffer. hand it our 32 byte member array so a
	// malloc can be avoided when possible.
	m_baseFilename.reset();
	m_baseFilename.setLabel("bfbf");
	m_baseFilename.setBuf ( m_tmpBaseBuf , 32 , 0 , false );
	// store both names, bail if either copy fails
	bool copied = m_dir.safeStrcpy ( dir ) &&
		      m_baseFilename.safeStrcpy ( baseFilename );
	if ( ! copied ) return false;
	// forget any parts registered by a previous call
	m_filePtrsBuf.reset();
	m_maxParts = 0;
	m_numParts = 0;
	// now register the parts found in the directory
	return addParts ( dir );
}
bool BigFile::reset ( ) {
// RdbMap calls BigFile (m_file)::reset() so we need to free
// the files and their safebufs for their filename and dir.
close ();
// reset filsize
m_fileSize = -1;
m_lastModified = -1;
// m_baseFilename contains the "dir" in it
//sprintf(m_baseFilename ,"%s/%s", dirname , baseFilename );
//strcpy ( m_baseFilename , baseFilename );
//strcpy ( m_dir , dir );
//if ( stripeDir ) strcpy ( m_stripeDir , stripeDir );
//else m_stripeDir[0] = '\0';
// reset # of parts
//m_numParts = 0;
//m_maxParts = 0;
// now add parts from both directories
// MDW: why is this in reset() function? remove...
//if ( ! addParts ( m_dir.getBufStart() ) ) return false;
//if ( ! addParts ( m_stripeDir ) ) return false;
return true;
}
bool BigFile::addParts ( char *dirname ) {
// if dirname is NULL return true
if ( ! dirname || ! dirname[0] ) return true;
// . now set the names of all the Files that we consist of
// . get the directory entry and find out what parts we have
Dir dir;
dir.set ( dirname );
// set our directory class
if (!dir.open()) return log("disk: openDir (\"%s\") failed",dirname);
// match files with this pattern in the directory
char pattern[256];
sprintf(pattern,"%s*", m_baseFilename.getBufStart() );
// length of the base filename
int32_t blen = gbstrlen ( m_baseFilename.getBufStart() );
// . set our m_files array
// . addFile() will return false on problems
// . the lower the fileId the older the file (w/ exception of #0)
char *filename;
while ( ( filename = dir.getNextFilename ( pattern ) ) ) {
// if filename len is exactly blen it's part 0
int32_t flen = gbstrlen(filename);
int32_t part = -1;
if ( flen == blen ) part = 0;
// some files have the same first X chars, like
// indexdb.store-info-bak but are not part files
else if ( flen > blen && strncmp(filename+blen,".part",5)!=0)
continue;
// otherwise must end in .part%i
else if (flen - blen < 6 ) {
log ("disk: Part extension too small for \"%s\". "
"Must end in .partN to be valid.",
filename);
continue;
}
else part = atoi ( filename + blen + 5 );
// ensure not too big
// if ( part >= MAX_PART_FILES ) {
// log ("disk: Part number of %"INT32" is too big for "
// "\"%s\". Should be less than %"INT32".",
// (int32_t)part,filename,(int32_t)MAX_PART_FILES);
// continue;
// }
// make this part file
if ( ! addPart ( part ) ) return false;
}
// now set the names of all our files
//for ( int32_t n = 0 ; n < MAX_PART_FILES ; n++ )
//m_files[n].set ( makeFilename ( n, m_baseFilename ) );
return true;
}
// WE CAN'T REALLOC the safebuf because there might be a thread
// referencing the file ptr. so let's just keep the m_filePtrs[] array
// and realloc on that.
bool BigFile::addPart ( int32_t n ) {
// if ( n >= MAX_PART_FILES )
// return log("disk: Part number %"INT32" > %"INT32".",
// n,(int32_t)MAX_PART_FILES);
// . grow our dynamic array and return ptr to last element
// . n's come in NOT necessarily in order!!!
int32_t need = (n+1) * sizeof(File *);
// capacity must be length always for this
if ( m_filePtrsBuf.getCapacity() != m_filePtrsBuf.getLength() ) {
char *xx=NULL;*xx=0;}
// init using tiny buf to save a malloc for small files
if ( m_filePtrsBuf.getCapacity() == 0 ) {
memset (m_tinyBuf,0,8);
m_filePtrsBuf.setBuf ( m_tinyBuf,8,0,false);
m_filePtrsBuf.setLength ( m_filePtrsBuf.getCapacity() );
}
// how much more mem do we need?
int32_t delta = need - m_filePtrsBuf.getLength();
// . make sure our CAPACITY is increased by what we need
// . SafeBuf::reserve() ADDS this much to current capacity
// . true = clear new mem new new file ptrs are null because
// there may be gaps or not exist because the BigFile was being
// merged.
if ( delta > 0 && ! m_filePtrsBuf.reserve ( delta ,"bfbuf",true ) ) {
log("file: failed to reserve %i more mem for part",delta);
return false;
}
// make length the capacity. so if buf is resized in call to
// SafeBuf::reserve() it will copy over all of the old buf to new buf
m_filePtrsBuf.setLength ( m_filePtrsBuf.getCapacity() );
File **filePtrs = (File **)m_filePtrsBuf.getBufStart();
//File *f = filesPtrs[n];
// sanity to ensure we do not breach the buffer
//char *fend = ((char *)f) + sizeof(File);
//if ( fend > m_fileBuf.getBuf() ) { char *xx=NULL;*xx=0; }
// we have to call constructor ourself then
//f->constructor();
File *f = NULL;
if ( m_numParts == 0 ) {
f = (File *)m_littleBuf;
if ( LITTLEBUFSIZE < sizeof(File) ) {
log("file: littlebufsize too small.");
char *xx=NULL;*xx=0;
}
f->constructor();
}
else {
try { f = new (File); }
catch ( ... ) {
g_errno = ENOMEM;
return log("BigFile: new(%i): %s",(int)sizeof(File),
mstrerror(g_errno));
}
mnew ( f , sizeof(File) , "BigFile" );
}
char buf[1024];
// make the filename for this new File class
makeFilename_r ( m_baseFilename.getBufStart() , NULL, n , buf , 1024 );
// and set it with that
f->set ( buf );
// store the ptr to it in m_filePtrs
filePtrs [ n ] = f;
m_numParts++;
// set maxPart
if ( n+1 > m_maxParts ) m_maxParts = n+1;
return true;
}
// returns true if at least one part file has been registered
bool BigFile::doesExist ( ) {
	return m_numParts != 0;
}
// . returns true if part #n has a File registered for it
// . anything at or beyond m_maxParts can't be registered
// . does not touch the disk, only consults what getFile2() returns
bool BigFile::doesPartExist ( int32_t n ) {
	// getFile2() gives back NULL when the part does not exist
	if ( n < m_maxParts ) return getFile2 ( n ) != NULL;
	return false;
}
// . source of the "virtual fd" numbers handed out by BigFile::open()
// . it is pre-incremented there, so the first m_vfd handed out is 1
static int64_t s_vfd = 0;
// do not use part files for this open so we can open regular really >2GB
// sized files with it
// bool BigFile::open2 ( int flags ,
// void *pc ,
// int64_t maxFileSize ,
// int permissions ) {
// return open ( flags , pc , maxFileSize , permissions , false );
// }
// . remember the open "flags" and make sure we have a virtual fd
// . no file descriptor is opened here. getfd() opens each part file the
//   first time it is needed.
// . "pc", "maxFileSize" and "permissions" are accepted but not used by this
//   implementation
// . always returns true
bool BigFile::open ( int flags ,
		     void *pc ,
		     int64_t maxFileSize ,
		     int permissions ) {
	m_isClosing = false;
	// this is true except when parsing big warc files
	m_usePartFiles = true;
	m_flags = flags;
	// already have our "virtual fd"? then we are done
	if ( m_vfd != -1 ) return true;
	// . otherwise take the next one
	// . this is our "virtual fd", not the same as File::m_vfd
	m_vfd = ++s_vfd;
	return true;
}
// get the filename of the nth file using m_dir/m_stripeDir & m_baseFilename
void BigFile::makeFilename_r ( char *baseFilename ,
char *baseFilenameDir ,
int32_t n ,
char *buf ,
int32_t bufSize ) {
char *dir = m_dir.getBufStart();
if ( baseFilenameDir && baseFilenameDir[0] ) dir = baseFilenameDir;
int32_t r;
// ensure we do not breach the buffer
// int32_t dirLen = gbstrlen(dir);
// int32_t baseLen = gbstrlen(baseFilename);
// int32_t need = dirLen + 1 + baseLen + 1;
// if ( need < bufSize ) { char *xx=NULL;*xx=0; }
//static char s[1024];
// if ( (n % 2) == 0 || ! m_stripeDir[0] )
// sprintf ( buf, "%s/%s", dir , baseFilename );
// else sprintf ( buf, "%s/%s", m_stripeDir, baseFilename );
if ( n == 0 ) {
r = snprintf ( buf, bufSize, "%s/%s",dir,baseFilename);
if ( r < bufSize ) return;
// truncation is bad
char *xx=NULL; *xx=0;
}
// return if it fit into "buf"
r = snprintf ( buf, bufSize, "%s/%s.part%"INT32,dir,baseFilename,n);
if ( r < bufSize ) return;
// truncation is bad
char *xx=NULL; *xx=0;
}
//int BigFile::getfdByOffset ( int64_t offset ) {
// return getfd ( offset / MAX_PART_SIZE , true /*forReading?*/ );
//}
// . get the fd of part file #n
// . will try to open the file if it hasn't yet been opened
// . a part number at or beyond m_maxParts gets registered with addPart()
// . a missing part below m_maxParts is only created when "forReading" is
//   false. we never create a File just to read from it.
// . returns -1 if the part is not available or can't be opened
// . returns -1 and sets g_errno to EBADENGINEER if the File reports an fd
//   below -1
int BigFile::getfd ( int32_t n , bool forReading ) {
	// boundary check. grow our table if need be.
	if ( n >= m_maxParts && ! addPart ( n ) ) {
		log("disk: Part number %"INT32" > %"INT32". fd "
		    "not available.",
		    n,m_maxParts);
		// return -1 to indicate can't do it
		return -1;
	}
	// get the File ptr from the table
	File *f = getFile2(n);
	// if part does not exist then create it, but only for writing
	if ( ! f ) {
		// don't create File if we're getting it for reading
		if ( forReading ) return -1;
		if ( ! addPart (n) ) return -1;
		// . addPart() stored the new File in the table, so fetch it
		// . "f" used to be left NULL here and was then dereferenced
		//   below
		f = getFile2(n);
		if ( ! f ) return -1;
	}
	// open it if not opened
	if ( ! f->calledOpen() &&
	     ! f->open ( m_flags , getFileCreationFlags() ) ) {
		log("disk: Failed to open file part #%"INT32".",n);
		return -1;
	}
	// get its file descriptor
	int fd = f->getfd ( ) ;
	if ( fd >= -1 ) return fd;
	// otherwise, fd is -2 and it's never been opened?!?!
	g_errno = EBADENGINEER;
	log(LOG_LOGIC,"disk: fd is -2.");
	return -1;
}
// . returns the big file's complete file size (can be well over 2gb), which
//   is the sum of the sizes of its part files
// . returns -2 if getting the size of any part file fails
// . the summing stops at the first part whose File reports -1 (does not
//   exist). with no parts at all the result is 0.
// . a missing part in the middle or at the head counts as a full
//   MAX_PART_SIZE
// . the result is cached in m_fileSize until that gets reset to -1
int64_t BigFile::getFileSize ( ) {
	// return if already computed
	if ( m_fileSize >= 0 ) return m_fileSize;
	// add up the sizes of each file
	int64_t totalSize = 0;
	for ( int32_t n = 0 ; n < m_maxParts ; n++ ) {
		File *f = getFile2(n);
		// we can have headless big files... count the heads.
		// this can happen if the first Files were deleted because
		// of an ongoing merge operation.
		if ( ! f ) {
			totalSize += MAX_PART_SIZE;
			continue;
		}
		// . returns -2 on error, -1 if does not exist
		// . TODO: it returns 0 if does not exist! FIX...
		// . hold it in 64 bits so a single file bigger than 2GB is
		//   not truncated here
		// . NOTE(review): this only helps if File::getFileSize()
		//   itself returns a 64-bit value -- confirm in File.h
		int64_t size = f->getFileSize();
		if ( size == -2 ) return -2;
		if ( size == -1 ) break;
		totalSize += size;
	}
	// save time on the next call
	m_fileSize = totalSize;
	return totalSize;
}
// . return -2 on error
// . return -1 if does not exist
// . otherwise returns the oldest of the last mod dates of all the part files
time_t BigFile::getLastModifiedTime ( ) {
// return if already computed
if ( m_lastModified >= 0 ) return m_lastModified;
// add up the sizes of each file
time_t min = -1;
for ( int32_t n = 0 ; n < m_maxParts ; n++ ) {
// shortcut
File *f = getFile2(n);
// we can have headless big files... count the heads
if ( ! f ) continue;
// returns -1 on error, 0 if file does not exist
time_t date = f->getLastModifiedTime();
if ( date == -1 ) return -2;
if ( date == 0 ) break;
// check min
if ( date < min || min == -1 ) min = date;
}
// save time
m_lastModified = min;
return m_lastModified;
}
// . read "size" bytes starting at "offset" of the big file into "buf"
// . thin wrapper that clears g_errno and calls readwrite() as a read
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . all other arguments are passed through to readwrite() unchanged
bool BigFile::read ( void *buf ,
		     int32_t size ,
		     int64_t offset ,
		     FileState *fs ,
		     void *state ,
		     void (* callback)(void *state) ,
		     int32_t niceness ,
		     bool allowPageCache ,
		     bool hitDisk ,
		     int32_t allocOff ) {
	// start with a clean slate
	g_errno = 0;
	const bool doWrite = false;
	return readwrite ( buf , size , offset , doWrite ,
			   fs , state , callback , niceness ,
			   allowPageCache , hitDisk , allocOff );
}
// . write "size" bytes from "buf" at "offset" of the big file
// . thin wrapper that clears g_errno and calls readwrite() as a write that
//   always hits the disk and uses an allocOff of 0
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . in read only mode nothing is written: a debug message is logged and
//   true is returned
bool BigFile::write ( void *buf ,
		      int32_t size ,
		      int64_t offset ,
		      FileState *fs ,
		      void *state ,
		      void (* callback)(void *state) ,
		      int32_t niceness ,
		      bool allowPageCache ) {
	if ( ! g_conf.m_readOnlyMode ) {
		g_errno = 0;
		const bool doWrite = true;
		return readwrite ( buf , size , offset , doWrite ,
				   fs , state , callback , niceness ,
				   allowPageCache ,
				   true /*hitDisk?*/ , 0 /*allocOff*/ );
	}
	// refuse to write
	logf(LOG_DEBUG,"disk: BigFile: Trying to write while in "
	     "read only mode.");
	return true;
}
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . we divide into 2 writes in case write spans 2 files
// . only BigFiles will support non-blocking read/writes for now
// . damn, i thought linux supported non-blocking file reads, but it doesn't!
// . we use the aio.h calls
// . we should us kaio from sgi cuz it's in the kernel and only uses 4 threads
// whereas using librt.a creates a thread every time we call aio_read/write()
// . fstate is used by aio_read/write()
// . we need a ptr to the ptr to this BigFile so if we get deleted and
// a signal is still pending for us, the callback will know we are nuked
bool BigFile::readwrite ( void *buf ,
int32_t size ,
int64_t offset ,
bool doWrite ,
FileState *fstate ,
void *state ,
void (* callback) ( void *state ) ,
int32_t niceness ,
bool allowPageCache ,
bool hitDisk ,
int32_t allocOff ) {
// are we blocking?
bool isNonBlocking = m_flags & O_NONBLOCK;
// if we're non blocking and caller didn't supply an "fstate"
if ( isNonBlocking && ! fstate ) {
g_errno = EBADENGINEER;
log(LOG_LOGIC,"disk: readwrite() call is "
"specified as non-blocking, but no state provided.");
return true;
}
// reset file size in case we change it here
if ( doWrite ) {
m_fileSize = -1;
m_lastModified = getTimeLocal();
}
// . sanity check
// . when our offset was just a int32_t 2gig+ files, when dumped,
// had negative offsets, bad engineer
if ( offset < 0 ) {
log(LOG_LOGIC,"disk: readwrite() offset is %"INT64" "
"< 0. filename=%s/%s. dumping core. try deleting "
"the .map file for it and restarting.",offset,
m_dir.getBufStart(),m_baseFilename.getBufStart());
char *xx = NULL; *xx = 0;
}
// if we're not blocking use a fake fstate
FileState tmp;
if ( ! fstate ) fstate = &tmp;
// . no error yet
// . need this up here in case it is a cache hit from a re-call
// due to a EFILECLOSED error
//fstate->m_errno = 0;
// offset to read into "buf"
int32_t bufOff = 0;
// point to start of space allocated to hold what we read. "buf"
// should be >= allocBuf + allocOff, depending on value of bufOff
char *allocBuf = NULL;
int32_t allocSize;
// reset this
fstate->m_errno = 0;
fstate->m_inPageCache = false;
// . try to get as much as we can from page cache first
// . the vfd of the big file will be the vfd of its last File class
/*
if ( ! doWrite && m_pc && allowPageCache ) {
//int32_t oldOff = offset;
// we have to set these so RdbScan doesn't freak out if we
// have it all cached and return without hitting disk
fstate->m_bytesDone = size;
fstate->m_bytesToGo = size;
// sanity
if ( m_vfd == -1 ) { char *xx=NULL;*xx=0; }
//log("getting pages off=%"INT64" size=%"INT32"",offset,size);
// now we pass in a ptr to the buf ptr, because if buf is NULL
// this will allocate one for us if it has some pages in the
// cache that we can use.
char *readBuf = m_pc->getPages ( m_vfd, offset, size );
//log("got pages off=%"INT64" size=%"INT32"",offset,size);
//bufOff = offset - oldOff;
// comment out for test
if ( readBuf ) {
// let caller/RdbScan know about the newly alloc'd buf
fstate->m_buf = (char *)readBuf;
fstate->m_allocBuf = readBuf;
fstate->m_allocSize = size;
fstate->m_allocOff = 0;
fstate->m_inPageCache = true;
return true;
}
// check
//if ( m_pc->m_isOverriden && size < 0 ) {
// fstate->m_bytesDone += size;
// fstate->m_bytesToGo += size;
// return true;
//}
}
*/
// sanity check. if you set hitDisk to false, you must allow
// us to check the page cache! silly bean!
if ( ! allowPageCache && ! hitDisk ) { char*xx=NULL;*xx=0; }
//if ( m_pc && m_pc->m_isOverriden )
// log ( LOG_INFO, "bigfile: HITTING DISK!! %"INT32"",
// (int32_t)allowPageCache );
// set up fstate
fstate->m_this = this;
// buf may be NULL if caller passed in a NULL "buf" and it did not hit
// the disk page cache. Threads.cpp will have to allocate it right
// before it launches the thread.
fstate->m_buf = (char *)buf + bufOff;
// if getPages() allocates a buf, this will point to it
fstate->m_allocBuf = allocBuf;
fstate->m_allocSize = allocSize;
// when buf is passed in as NULL we allocate it in Threads.cpp right
// before we launch it to save memory. it may also be allocated in
// DiskPageCache.cpp. we have to know where to start storing
// the read into it for RdbScan, it is not immediately at the
// beginning of the allocated buffer because RdbScan may have to
// turn the first key from a 6 byte half key into a 12 byte key so it
// needs some initial padding. this is because RdbLists should never
// start with a 6 byte half key.
fstate->m_allocOff = allocOff;
fstate->m_bytesToGo = size;
fstate->m_offset = offset;
fstate->m_doWrite = doWrite;
fstate->m_bytesDone = 0;
fstate->m_state = state;
fstate->m_callback = callback;
fstate->m_niceness = niceness;
fstate->m_flags = m_flags;
fstate->m_usePartFiles = m_usePartFiles;
// sanity
if ( fstate->m_bytesToGo > 150000000 )
log("file: huge read of %"INT64" bytes",(int64_t)size);
// . set our fd's before entering the thread in case RdbMerge
// calls our unlinkPart()
// . it's thread-UNsafe to call getfd() from within the thread
// . FUCK! what if we get unlinked and another file gets this fd!!
// . now we do do unlinks in a thread in File.cpp, but since we
// employ the getCloseCount_r() scheme we can detect when this
// situation occurs and pass a g_errno back to the caller.
fstate->m_filenum1 = offset / MAX_PART_SIZE;
fstate->m_filenum2 = (offset + size ) / MAX_PART_SIZE;
// if not really a big file. we use this for parsing huge warc files
if ( ! m_usePartFiles ) {
fstate->m_filenum1 = 0;
fstate->m_filenum2 = 0;
}
// . save the open count for this fd
// . if it changes when we're done with the read we do a re-read
// . it gets incremented once every time File calls ::open and gets
// back this fd
// . fd1 and fd1 are now set in Threads.cpp since we only want to do
// the open right before we actually launch the thread.
//fstate->m_fd1 = getfd ( fstate->m_filenum1 , !doWrite ,
// &fstate->m_vfd1);
//fstate->m_fd2 = getfd ( fstate->m_filenum2 , !doWrite ,
// &fstate->m_vfd2);
fstate->m_fd1 = -3;
fstate->m_fd2 = -3;
// fstate->m_vfd1 = -3;
// fstate->m_vfd2 = -3;
// . if we are writing, prevent these fds from being closed on us
// by File::closedLeastUsed(), because the fd could then be re-opened
// by someone else doing a write and we end up writing to THAT FILE!
// . the closeCount mechanism helps us DETECT when something like this
// happens, but it will not prevent the write from going through
if ( doWrite ) {
// actually have to do the open here for writing so it
// can prevent the fds from being closed on us
fstate->m_fd1 = getfd ( fstate->m_filenum1 , !doWrite);
fstate->m_fd2 = getfd ( fstate->m_filenum2 , !doWrite);
//File *f1 = m_files [ fstate->m_filenum1 ];
//File *f2 = m_files [ fstate->m_filenum2 ];
enterWriteMode( fstate->m_fd1 );
enterWriteMode( fstate->m_fd2 );
fstate->m_closeCount1 = getCloseCount_r ( fstate->m_fd1 );
fstate->m_closeCount2 = getCloseCount_r ( fstate->m_fd2 );
}
// get the close counts after calling getfd() since if getfd() calls
// File::open() that will inc the counts
// closeCount1 and 2 are now set in Threads.cpp since we want to only
// open the fd right before we launch the thread.
//fstate->m_closeCount1 = getCloseCount_r ( fstate->m_fd1 );
//fstate->m_closeCount2 = getCloseCount_r ( fstate->m_fd2 );
fstate->m_errno = 0;
fstate->m_errno2 = 0;
fstate->m_startTime = gettimeofdayInMilliseconds();
//fstate->m_pc = NULL;//m_pc;
// if ( ! allowPageCache )
// fstate->m_pc = NULL;
fstate->m_vfd = m_vfd;
// if hitDisk was false we only check the page cache!
if ( ! hitDisk ) return true;
// if disk stuck, forget about it! but make the spider disk reads
// wait until it is unstuck. just don't want to screw up the queries..
if ( g_diskIsStuck && niceness == 0 && ! doWrite ) {
g_errno = fstate->m_errno = EDISKSTUCK;
return true;
}
int32_t saved;
// . if we're blocking then do it now
// . this should return false and set g_errno on error, true otherwise
if ( ! isNonBlocking ) goto skipThread;
if ( g_threads.m_disabled ) goto skipThread;
if ( ! g_conf.m_useThreads ) goto skipThread;
#ifdef ASYNCIO
goto skipThread;
#endif
// . otherwise, spawn a thread to do this i/o
// . this returns false and sets g_errno on error, true on success
// . we should return false cuz we blocked
// . thread will add signal to g_loop on completion to call
if ( g_threads.call ( DISK_THREAD/*threadType*/, niceness , fstate ,
doneWrapper , readwriteWrapper_r) ) return false;
saved = g_errno;
// note it
if ( g_errno ) {
static time_t s_time = 0;
time_t now = getTime();
if ( now - s_time > 5 ) {
log (LOG_INFO,"disk: Thread call failed: %s.",
mstrerror(g_errno));
s_time = now;
}
}
// sanity check
if ( ! callback ) { char *xx = NULL; *xx = 0; }
// NOW we return on error because if we already have 5000 disk threads
// queued up, what is the point in blocking ourselves off? that makes
// us look like a dead host and very unresponsive. As int32_t as this
// request originated through Multicast, then multicast will sleep
// and retry. Msg3 could retry, the multicast thing should be more
// for running out of udp slots though...
// crap, call to clone() now fails a lot since we use pthreads
// library ... so assume that is it i guess (MDW 3/15/2014)
//if ( g_errno && ! doWrite && g_errno != ENOTHREADSLOTS ) {
// log (LOG_INFO,"disk: May retry later.");
// return true;
//}
// otherwise, thread spawn failed, do it blocking then
g_errno = 0;
// if threads are manually disabled don't print these msgs because
// we redbox the fact above the controls in Pages.cpp
if ( saved ) { // g_conf.m_useThreads && ! g_threads.m_disabled ) {
static int32_t s_lastTime = 0;
int32_t now = getTime();
if ( now - s_lastTime >= 1 ) {
s_lastTime = now;
log (LOG_INFO,
"disk: Doing blocking disk access. "
//"This will hurt "
//"performance. "
"isWrite=%"INT32". (%s)",(int32_t)doWrite,
mstrerror(saved));
}
}
// come here if we haven't spawned a thread
skipThread:
// if there was no room in the thread queue, then we must do this here
fstate->m_fd1 = getfd ( fstate->m_filenum1 , !doWrite );
fstate->m_fd2 = getfd ( fstate->m_filenum2 , !doWrite );
fstate->m_closeCount1 = getCloseCount_r ( fstate->m_fd1 );
fstate->m_closeCount2 = getCloseCount_r ( fstate->m_fd2 );
// clear g_errno from the failed thread spawn
g_errno = 0;
// since Threads.cpp usually allocs the buffer before launching,
// we must do it here now
FileState *fs = fstate;
if ( ! fs->m_doWrite && ! fs->m_buf && fs->m_bytesToGo > 0 ) {
int32_t need = fs->m_bytesToGo + fs->m_allocOff;
char *p = (char *) mmalloc ( need , "ThreadReadBuf" );
if ( p ) {
fs->m_buf = p + fs->m_allocOff;
fs->m_allocBuf = p;
fs->m_allocSize = need;
}
else
log("disk: read buf alloc failed for %"INT32" "
"bytes.",need);
}
//
// pthread_create() is abhorently slow. use asyncio if possible.
//
#ifdef ASYNCIO
// we only have two in the array... most likely though we only
// need one here...
aiocb *a0 = &fstate->m_aiocb[0];
aiocb *a1 = &fstate->m_aiocb[1];
// init them for the read
a0->aio_fildes = fstate->m_fd1;
a1->aio_fildes = fstate->m_fd2;
// the offset of each file
int64_t off1 = fs->m_offset;
// always read at start of 2nd file
int64_t off2 = 0;
// how many bytes to read from each file?
int64_t readSize1 = size;
int64_t readSize2 = 0;
if ( off1 + readSize1 > MAX_PART_SIZE && m_usePartFiles ) {
readSize1 = ((int64_t)MAX_PART_SIZE) - off1;
readSize2 = size - readSize1;
}
a0->aio_offset = off1;
a1->aio_offset = off2;
a0->aio_nbytes = readSize1;
a1->aio_nbytes = readSize2;
a0->aio_buf = fstate->m_buf;
a1->aio_buf = fstate->m_buf + readSize1;
a0->aio_reqprio = 0;
a1->aio_reqprio = 0;
a0->aio_sigevent = SIGEV_SIGNAL;
a1->aio_sigevent = SIGEV_SIGNAL;
// translate offset to a filenum and offset
int32_t filenum = offset / MAX_PART_SIZE;
int32_t localOffset = offset % MAX_PART_SIZE;
if ( ! m_usePartFiles ) {
filenum = 0;
localOffset = offset;
}
// read or write?
if ( doWrite ) a0->aio_lio_opcode = LIO_WRITE;
else a0->aio_lio_opcode = LIO_READ;
// different fds implies two different files we gotta read from.
int32_t numFilesToReadFrom = 1;
if ( fstate->m_fd1 != fstate->m_fd2 ) numFilesToReadFrom = 2;
// set it up
//aioList->m_signal = ESIG;
retry77:
//
// don't use this on kernels below 3.12 because it can block
// when reading ext4 files.
//
io_submit();
// this will send the signal when read/write is completed
//int32_t status = lio_listio ( LIO_NOWAIT ,
// a0 ,
// numFilesToReadFrom ,
// &fstate->m_sigEvent );
// if status is 0, there was no error
if ( status == 0 ) {
g_errno = 0;
// assume we will get the signal later
return false;
}
// got interrupted by a signal? try again.
if ( errno == EINTR )
goto retry77;
// tell caller about the error
g_errno = errno;
log("aio: %s", mstrerror(g_errno));
// we did not block or anything
return true;
#endif
// . this returns false and sets errno on error
// . set g_errno to the errno
if ( ! readwrite_r ( fstate , NULL ) ) g_errno = errno;
// exit write mode
if ( doWrite ) {
//File *f1 = m_files [ fstate->m_filenum1 ];
//File *f2 = m_files [ fstate->m_filenum2 ];
//f1->exitWriteMode();
//f2->exitWriteMode();
exitWriteMode( fstate->m_fd1 );
exitWriteMode( fstate->m_fd2 );
}
// set this up here
fstate->m_bytesDone = fstate->m_bytesToGo;
// and this too
fstate->m_doneTime = gettimeofdayInMilliseconds();
// if it read less than 8MB/s bitch
int64_t now = gettimeofdayInMilliseconds() ;
int64_t took = now - fstate->m_startTime ;
int32_t rate = 100000;
if ( took > 500 ) rate = fstate->m_bytesDone / took ;
if ( rate < 8000 && fstate->m_niceness <= 0 ) {
log(LOG_INFO,"disk: Read %"INT64" bytes in %"INT64" "
"ms (%"INT32"KB/s).",
fstate->m_bytesDone,took,rate);
g_stats.m_slowDiskReads++;
}
// default graph color is black
int color = 0x00000000;
char *label = "disk_read";
// use red for writes, though
if ( fstate->m_doWrite ) {
color = 0x00ff0000;
label = "disk_write";
}
// but gray for low priority reads
else if ( fstate->m_niceness > 0 ) color = 0x00808080;
// add the stat
g_stats.addStat_r ( fstate->m_bytesDone ,
fstate->m_startTime ,
now ,
//label ,
color );
// add to statsdb as well
//g_statsdb.addStat ( fstate->m_niceness,
// label,
// fstate->m_startTime,
// now,
// fstate->m_bytesDone);
// store read/written pages into page cache
// if ( ! g_errno && fstate->m_pc )
// fstate->m_pc->addPages ( fstate->m_vfd ,
// fstate->m_offset ,
// fstate->m_bytesDone ,
// fstate->m_buf ,
// fstate->m_niceness );
// now log our stuff here
if ( g_errno && g_errno != EBADENGINEER )
log("disk: readwrite: %s", mstrerror(g_errno));
// . this EBADENGINEER can happen right after a merge if
// the file is renamed because the fd may have changed from
// under us
// . i added EBADF because RbdDump was failing because of this when
// trying to write the tree to a file
// . EBADF happens when we unlink a file from under a read or write
// . the closeCount code below was not saving us from coring on EBADF
// because the closeCount is only changed if another file is opened
// with that fd, it is not incremented on a close() but rather on
// an open()
/*
if ( g_errno == EBADENGINEER ) { // || g_errno == EBADF ) {
int32_t fn1 = fstate->m_filenum1;
int32_t fn2 = fstate->m_filenum2;
char *s = getFilename();
log(LOG_DEBUG,"disk: Closing old fd1 (%s,%"INT32")",s,fn1);
log(LOG_DEBUG,"disk: Closing old fd2 (%s,%"INT32")",s,fn2);
// get the File ptr from the table
File *f1 = getFile(fn1);
File *f2 = getFile(fn2);
if ( f2 == f1 ) f2 = NULL;
log(LOG_DEBUG,"disk: Closing old fd1 (%s,%"INT32")",s,fn1);
if ( f2) log(LOG_DEBUG,"disk: Closing old fd2 (%s,%"INT32")",s,fn2);
if ( f1 ) f1->close();
if ( f2 ) f2->close();
}
*/
// we didn't block so return true
return true;
}
// . this should be called from the main process after getting our call OUR callback here
void doneWrapper ( void *state , ThreadEntry *t ) {
FileState *fstate = (FileState *)state;
// any writes we did in the disk read thread were done to the
// "tmp" FileState class on the stack, so now we have the real deal
// we can update all this junk.
fstate->m_bytesDone = fstate->m_bytesToGo;
fstate->m_doneTime = t->m_exitTime; // set in Threads.cpp
fstate->m_errno = t->m_errno;
// exit write mode
if ( fstate->m_doWrite ) {
// THIS could have been deleted!!
//BigFile *THIS = fstate->m_this;
//File *f1 = THIS->m_files [ fstate->m_filenum1 ];
//File *f2 = THIS->m_files [ fstate->m_filenum2 ];
//f1->exitWriteMode();
//f2->exitWriteMode();
exitWriteMode( fstate->m_fd1 );
exitWriteMode( fstate->m_fd2 );
}
// if it read less than 8MB/s bitch
int64_t took = fstate->m_doneTime - fstate->m_startTime;
int32_t rate = 100000;
if ( took > 500 ) rate = fstate->m_bytesDone / took ;
bool slow = false;
if ( rate < 8000 ) slow = true;
if ( fstate->m_errno == EDISKSTUCK ) slow = true;
if ( slow && fstate->m_niceness <= 0 ) {
if ( fstate->m_errno != EDISKSTUCK )
log(LOG_INFO, "disk: Read %"INT64" bytes in %"INT64" "
"ms (%"INT32"KB/s).",
fstate->m_bytesDone,took,rate);
g_stats.m_slowDiskReads++;
}
// get the BigFIle
//BigFile *THIS = fs->m_this;
// recall g_errno from state's m_errno
g_errno = fstate->m_errno;
// might have had the file renamed/unlinked from under us
if ( ! g_errno ) g_errno = fstate->m_errno2;
// fstate has his own m_pc in case BigFile got deleted, we cannot
// reference it...
// if ( ! g_errno && fstate->m_pc )
// fstate->m_pc->addPages ( fstate->m_vfd ,