Merge branch 'writeback-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux

* 'writeback-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux:
  writeback: set max_pause to lowest value on zero bdi_dirty
  writeback: permit through good bdi even when global dirty exceeded
  writeback: comment on the bdi dirty threshold
  fs: Make write(2) interruptible by a fatal signal
  writeback: Fix issue on make htmldocs
2011-12-13 14:58:56 -08:00 · 2011-12-13 14:58:56 -08:00 · 4dde6dedad
commit 4dde6dedad
parent 442ee5a942 82e230a07d
3 changed files with 37 additions and 6 deletions
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@ -156,6 +156,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 * bdi_start_writeback - start writeback
 * @bdi: the backing device to write from
 * @nr_pages: the number of pages to write
+ * @reason: reason why some writeback work was initiated
 *
 * Description:
 *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
@ -1223,6 +1224,7 @@ static void wait_sb_inodes(struct super_block *sb)
 * writeback_inodes_sb_nr -	writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
+ * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
@ -1251,6 +1253,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr);
 /**
 * writeback_inodes_sb	-	writeback dirty inodes from given super_block
 * @sb: the superblock
+ * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
@ -1265,6 +1268,7 @@ EXPORT_SYMBOL(writeback_inodes_sb);
 /**
 * writeback_inodes_sb_if_idle	-	start writeback if none underway
 * @sb: the superblock
+ * @reason: reason why some writeback work was initiated
 *
 * Invoke writeback_inodes_sb if no writeback is currently underway.
 * Returns 1 if writeback was started, 0 if not.
@ -1285,6 +1289,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
 * writeback_inodes_sb_nr_if_idle	-	start writeback if none underway
 * @sb: the superblock
 * @nr: the number of pages to write
+ * @reason: reason why some writeback work was initiated
 *
 * Invoke writeback_inodes_sb if no writeback is currently underway.
 * Returns 1 if writeback was started, 0 if not.
--- a/mm/filemap.c
+++ b/mm/filemap.c
@ -2407,7 +2407,6 @@ static ssize_t generic_perform_write(struct file *file,
 						iov_iter_count(i));

 again:
-
 		/*
 		 * Bring in the user page that we will copy from _first_.
 		 * Otherwise there's a nasty deadlock on copying from the
@ -2463,7 +2462,10 @@ static ssize_t generic_perform_write(struct file *file,
 		written += copied;

 		balance_dirty_pages_ratelimited(mapping);
-
+		if (fatal_signal_pending(current)) {
+			status = -EINTR;
+			break;
+		}
 	} while (iov_iter_count(i));

 	return written ? written : status;
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@ -411,8 +411,13 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
 *
 * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
- * And the "limit" in the name is not seriously taken as hard limit in
- * balance_dirty_pages().
+ *
+ * Note that balance_dirty_pages() will only seriously take it as a hard limit
+ * when sleeping max_pause per page is not enough to keep the dirty pages under
+ * control. For example, when the device is completely stalled due to some error
+ * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
+ * In normal situations, it acts more gently by throttling the tasks more
+ * (rather than completely blocking them) when the bdi dirty pages go high.
 *
 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
 * - starving fast devices
@ -594,6 +599,13 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
 	 */
 	if (unlikely(bdi_thresh > thresh))
 		bdi_thresh = thresh;
+	/*
+	 * It's very possible that bdi_thresh is close to 0 not because the
+	 * device is slow, but because it has remained inactive for a long
+	 * time. Grant such devices a reasonably good (hopefully IO efficient)
+	 * threshold, so that occasional writes won't be blocked and active
+	 * writes can ramp up the threshold quickly.
+	 */
 	bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
 	/*
 	 * scale global setpoint to bdi's:
@ -977,8 +989,7 @@ static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
 	 *
 	 * 8 serves as the safety ratio.
 	 */
-	if (bdi_dirty)
-		t = min(t, bdi_dirty * HZ / (8 * bw + 1));
+	t = min(t, bdi_dirty * HZ / (8 * bw + 1));

 	/*
 	 * The pause time will be settled within range (max_pause/4, max_pause).
@ -1136,6 +1147,19 @@ static void balance_dirty_pages(struct address_space *mapping,
 		if (task_ratelimit)
 			break;

+		/*
+		 * In the case of an unresponsive NFS server whose dirty pages
+		 * exceed dirty_thresh, give the other good bdi's a pipe to go
+		 * through, so that tasks on them still remain responsive.
+		 *
+		 * In theory 1 page is enough to keep the consumer-producer
+		 * pipe going: the flusher cleans 1 page => the task dirties 1
+		 * more page. However bdi_dirty has accounting errors.  So use
+		 * the larger and more IO friendly bdi_stat_error.
+		 */
+		if (bdi_dirty <= bdi_stat_error(bdi))
+			break;
+
 		if (fatal_signal_pending(current))
 			break;
 	}